In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
########################################
## import packages
########################################
import os
import re
import csv
import sys
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import roc_auc_score

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten, Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model,load_model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.layers import Bidirectional, GlobalMaxPool1D,GlobalAveragePooling1D ,Conv1D, MaxPooling1D, GRU, CuDNNGRU
from keras.optimizers import RMSprop, SGD
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

Using TensorFlow backend.


In [2]:
class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    # Arguments
        step_dim: number of time steps the scores are reshaped to in `call`;
            it has to agree with `input_shape[1]`, which sizes the bias.
        W_regularizer, b_regularizer: regularizer identifiers for the
            weight vector and the bias (resolved with `regularizers.get`).
        W_constraint, b_constraint: constraint identifiers for the weight
            vector and the bias (resolved with `constraints.get`).
        bias: whether a per-step bias is added to the scores.
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
    """

    def __init__(self, step_dim=70,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)
        # Set after the base constructor so that the base class cannot
        # overwrite the flag while it initialises its own attributes.
        self.supports_masking = True

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias."""
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every time step: flatten to (-1, features), multiply by W as
        # a column vector, then fold the result back to (-1, step_dim).
        # (A plain K.dot(x, self.W) was noted as unsupported on the TF backend.)
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this, a saved model would rebuild the layer with the default
        `step_dim`, `bias`, regularizers and constraints.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [3]:
# List the files found in the model directory.
# NOTE(review): the previous comments here referred to "submission files"
# downloaded from public kernels, but this cell only lists model_path.
# sub_path is an absolute local path and is not used anywhere in the visible
# notebook — confirm whether it is still needed.
model_path = "../model/"
sub_path = "D:/OneDrive/Documentos/Toxic/sub/"
all_files = os.listdir(model_path)
print (all_files)

['model.hdf5', 'tokenizer.pickle']


In [4]:
print (model_path)
# Earlier attempts, kept for reference:
#load_model(model_path+"model.hdf5")
#model = load_model(model_path+"model.hdf5", custom_objects={'Attention':Attention()})
# Load the saved model; no custom_objects mapping is passed.
model=load_model(model_path+"model.hdf5")

../model/


In [5]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 70)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 70, 300)           28500300  
_________________________________________________________________
lstm_1 (LSTM)                (None, 70, 100)           160400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total para

In [None]:
# Load every saved Keras model found in model_path.
# Only model files are handed to load_model: the directory listing also
# contains 'tokenizer.pickle', which is not a model and would make
# load_model fail.
models = [load_model(model_path + f)
          for f in all_files
          if f.lower().endswith(('.hdf5', '.h5'))]

In [6]:
def merge_several_folds_mean(data, nfolds):
    """Average the first `nfolds` prediction arrays element-wise.

    Args:
        data: indexable collection of equally shaped array-likes, one per
            fold; `data[0]` .. `data[nfolds - 1]` are read.
        nfolds: number of folds to average over.

    Returns:
        The element-wise mean as a (nested) Python list.
    """
    # np.array() copies, so the caller's first fold is never modified.
    total = np.array(data[0])
    # In-place true division is rejected by NumPy for integer/boolean arrays,
    # so promote those to float first. Floating inputs keep their dtype.
    if total.dtype.kind in "biu":
        total = total.astype(np.float64)
    for fold in range(1, nfolds):
        total += np.array(data[fold])
    total /= nfolds
    return total.tolist()

In [None]:
# Average the per-fold test predictions.
# NOTE(review): yfull_test and nfolds are not defined anywhere in the visible
# notebook and this cell has no execution count — confirm before running.
test_res = merge_several_folds_mean(yfull_test, nfolds)
 

In [7]:
########################################
## process texts in datasets
########################################
import re
def remove_urls (vTEXT):
    """Return `vTEXT` with every URL-like `://...` span deleted.

    The scheme (`http`/`https`) is optional in the pattern, and the match
    stops at the first character outside the allowed URL character set.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    return re.sub(url_pattern, '', vTEXT, flags=re.MULTILINE)

def ReplaceThreeOrMore(s):
    """Collapse any run of three or more identical characters to one.

    Newlines count as characters too (DOTALL), so blank-line runs are
    collapsed as well. Runs of exactly two are left alone.
    """
    return re.sub(r"(.)\1{2,}", r"\1", s, flags=re.DOTALL)

def splitstring(s):
    """Return the shortest leading unit of `s` that is immediately repeated.

    The candidate unit starts as the first character and grows by one
    non-space character at a time; as soon as the candidate is followed by a
    copy of itself it is returned (e.g. "hahaha" -> "ha"). If no repetition is
    found, the accumulated candidate is returned, which for a string without
    spaces is the string itself.

    Note: only the prefix is inspected, so a word that merely *starts* with a
    doubled unit is truncated to that unit (e.g. "aab" -> "a").

    Args:
        s: the string (normally a single token) to inspect.

    Returns:
        The repeated leading unit, or the accumulated candidate when nothing
        repeats. An empty input is returned unchanged.
    """
    if not s:
        # Nothing to inspect; avoids indexing into an empty string.
        return s

    proposed_pattern = s[0]
    for i, c in enumerate(s[1:], 1):
        if c != " ":
            if proposed_pattern == s[i:(i+len(proposed_pattern))]:
                # found the repeated unit
                break
            else:
                proposed_pattern += c
    # No exit() when the loop ends without a match: outside IPython that
    # would terminate the interpreter for every ordinary, non-repeating word.
    return proposed_pattern

def clean_numbers(x):
    """Mask digit runs in `x` with `#` characters.

    Runs of five or more digits become '#####'; remaining groups of four,
    three and two digits become '####', '###' and '##'. Single digits are
    left untouched. The substitutions are applied longest-first.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_run, mask in masks:
        x = re.sub(digit_run, mask, x)
    return x
# Matches every character other than the letters a-z (case-insensitive),
# digits (\d) and the space character; used to strip such characters.
special_character_removal = re.compile(r'[^a-z\d ]', flags=re.IGNORECASE)

# Matches each run of one or more digits; used to replace numbers.
replace_numbers = re.compile(r'\d+', flags=re.IGNORECASE)

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table: text to look for -> text to substitute.
# Insertion order is kept as-is because the keys are joined into a regex
# alternation in this order.
mispell_dict = {
    # spelling variants
    "colour": "color",
    "centre": "center",
    # contractions written without the apostrophe
    "didnt": "did not",
    "doesnt": "does not",
    "isnt": "is not",
    "shouldnt": "should not",
    # more spelling variants
    "favourite": "favorite",
    "travelling": "traveling",
    "counselling": "counseling",
    "theatre": "theater",
    "cancelled": "canceled",
    "labour": "labor",
    "organisation": "organization",
    # abbreviation and misspelling
    "wwii": "world war 2",
    "citicise": "criticize",
    # product names mapped to one generic phrase
    "instagram": "social medium",
    "whatsapp": "social medium",
    "snapchat": "social medium",
}
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Replace every occurrence of a `mispellings` key in `text`.

    Each match of `mispellings_re` is looked up in `mispellings` and
    substituted with the mapped value; the rest of the text is unchanged.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

def clean_text(x):
    """Strip or space out punctuation in `x` (converted with `str`).

    * '/', '-' and the straight apostrophe become a single space;
    * '&' is padded with a space on each side;
    * the remaining listed punctuation, including curly quotes, is deleted.
    """
    # One translation table does the three passes at once. The replacement
    # texts contain no character that a later pass would touch, so a single
    # pass gives the same result as sequential replaces.
    table = {ord(ch): ' ' for ch in "/-'"}
    table[ord('&')] = ' & '
    for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        # setdefault keeps the "replace with space" rule for / - '
        table.setdefault(ord(ch), '')
    return str(x).translate(table)

def text_to_wordlist(text,to_lower=False, rem_urls=False, rem_3plus=False,
                     clean_t=True, clean_num=True,mispelling=True,rem_specwords= False,
                     split_repeated=True, rem_special=False, rep_num=False, 
                     man_adj=True, rem_stopwords=False, stem_snowball=False,
                     stem_porter=False, lemmatize=False):
    """Clean one piece of text through a configurable pipeline.

    Despite the name, the result is a single space-joined string.
    The steps run in this order, each only when its flag is true:

    Args:
        text: the text to clean (one string, not a list of strings).
        rem_urls: delete URLs with `remove_urls`.
        to_lower: lower-case the text.
        rem_3plus: collapse 3+ repeated characters with `ReplaceThreeOrMore`.
        clean_t: strip punctuation with `clean_text`.
        clean_num: mask digit runs with `clean_numbers`.
        mispelling: apply `replace_typical_misspell`.
        man_adj: apply the hand-written regex substitutions below.
        split_repeated: reduce each token with `splitstring`.
        rem_specwords: drop the tokens 'a', 'to', 'of', 'and'.
        rem_stopwords: drop NLTK English stop words.
        rem_special: remove characters matched by `special_character_removal`.
        rep_num: replace digit runs with 'n' via `replace_numbers`.
        stem_snowball: stem tokens with NLTK's Snowball stemmer.
        stem_porter: stem tokens with NLTK's Porter stemmer.
        lemmatize: lemmatize tokens with NLTK's WordNet lemmatizer.

    Returns:
        The cleaned text as one string with tokens separated by single spaces.
    """
    if rem_urls:
        text = remove_urls(text)
    if to_lower:    
        text = text.lower()
    if rem_3plus:    
        text = ReplaceThreeOrMore(text)
        
    if clean_t:
        text= clean_text(text)
        
    if clean_num:
        text= clean_numbers(text)    
        
    if mispelling:
        text= replace_typical_misspell(text)

    if man_adj: 
        # Hand-written substitutions; the order matters and is kept as-is.
        # NOTE(review): inside the character class below, "+-=" is a range
        # from '+' to '=', not three separate characters — confirm intent.
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        # NOTE(review): "\0" is the NUL character in a regex, so this only
        # matches NUL followed by "s" — left unchanged, confirm intent.
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

    # split into tokens on whitespace
    text = text.split()
    
    if split_repeated:
        for i, c in enumerate(text):
            text[i]=splitstring(c)
    
    if rem_specwords:    
        to_remove = ['a','to','of','and']
        text = [w for w in text if not w in to_remove]
        
    # Optionally, remove stop words
    if rem_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    if rem_special: 
        text=special_character_removal.sub('',text)
    
    #Replace Numbers
    if rep_num:     
        text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_snowball:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    if stem_porter:
        # Imported here because the notebook's import cell only brings in
        # SnowballStemmer.
        from nltk.stem import PorterStemmer
        st = PorterStemmer()
        # Assign back to `text`; the result used to be stored in an unused
        # variable and was therefore dropped.
        text = " ".join([st.stem(w) for w in text.split()])
        
    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        text = " ".join([wordnet_lemmatizer.lemmatize(w) for w in text.split()])   
 
    # Return the cleaned text as a single string
    return(text)

In [8]:
import pickle
def save_tokenizer(file, tokenizer_obj=None):
    """Pickle a tokenizer to `file`.

    Args:
        file: path of the file to write (opened in binary mode).
        tokenizer_obj: object to store. When omitted, the module-level
            `tokenizer` is saved, which is what this function always did.
    """
    if tokenizer_obj is None:
        # Fall back to the notebook-level tokenizer for existing callers.
        tokenizer_obj = tokenizer
    with open(file, 'wb') as handle:
        pickle.dump(tokenizer_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_tokenizer(file):
    """Read back and return the object pickled in `file`.

    Caution: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.
    """
    with open(file, 'rb') as pickled:
        return pickle.load(pickled)

In [39]:

# Length every token sequence is padded to by pad_sequences (see prepare_question).
maxlen=70
# Vocabulary size cap; only referenced by the commented-out Tokenizer below.
max_features=95000
## Load the previously fitted tokenizer from disk instead of fitting a new one
#tokenizer = Tokenizer(num_words=max_features)
print (model_path+'tokenizer.pickle')
tokenizer=load_tokenizer(model_path+'tokenizer.pickle')

../model/tokenizer.pickle


In [78]:
def prepare_question(question, PreProcess=False):
    """Turn one question string into a padded token-id array for the model.

    Intermediate values are printed at every stage for inspection.

    Args:
        question: the question text.
        PreProcess: when true, the question is cleaned with
            `text_to_wordlist` before tokenizing.

    Returns:
        The result of `pad_sequences` on the tokenized question, padded to
        `maxlen`.
    """
    print(question)

    questions = [question]

    print (questions)
    if PreProcess:
        # Clean each question string on its own. Passing the whole list to
        # text_to_wordlist would clean the list's printed representation
        # (brackets, quotes, escape sequences) instead of the question text.
        prep_questions = [text_to_wordlist(q) for q in questions]
    else:
        prep_questions = questions
    print ("_________________")
    print (prep_questions)
    # Uses the module-level tokenizer (read-only, so no `global` is needed).
    question_ = tokenizer.texts_to_sequences(prep_questions)
    print (question_)
    ## Pad the sentences
    questions_pad = pad_sequences(question_, maxlen=maxlen)
    print (questions_pad)

    return questions_pad

In [121]:
# Encode an upper-cased sample question without preprocessing (PreProcess=False).
q=prepare_question("WHAT IS THE biggest country in the world?",False)

WHAT IS THE biggest country in the world?
['WHAT IS THE biggest country in the world?']
_________________
['WHAT IS THE biggest country in the world?']
[[2, 3, 1, 643, 130, 6, 1, 95]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   2   3   1 643 130   6   1  95]]


In [122]:
# Encode the same question with preprocessing enabled (PreProcess=True).
q1=prepare_question("What is the biggest country in the world?",True)

What is the biggest country in the world?
['What is the biggest country in the world?']
_________________
['What is the biggest country in the world']
[[2, 3, 1, 643, 130, 6, 1, 95]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   2   3   1 643 130   6   1  95]]


In [123]:
# Score both encoded questions (q: no preprocessing, q1: preprocessed).
test_prediction = model.predict(q, batch_size=512, verbose=1)
test_prediction1 = model.predict(q1, batch_size=512, verbose=1)



In [118]:
# Show the two predictions side by side for comparison.
print (test_prediction,test_prediction1)

[[0.00126187]] [[0.00126187]]


In [91]:
# Load the train and test CSV files and report their (rows, columns) shapes.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [92]:
## Extract the raw question strings as arrays.
## The train/validation split below is disabled, so no split happens here.
#train_fit, train_val = train_test_split(train_df, test_size=0.08, random_state=2018)
train_X=train_df["question_text"].values
test_X=test_df["question_text"].values

In [114]:
# Peek at raw questions 1..9 (the slice skips index 0).
print (train_X[1:10])
    

['Do you have an adopted dog, how would you encourage people to adopt and not shop?'
 'Why does velocity affect time? Does velocity affect space geometry?'
 'How did Otto von Guericke used the Magdeburg hemispheres?'
 'Can I convert montra helicon D to a mountain bike by just changing the tyres?'
 'Is Gaza slowly becoming Auschwitz, Dachau or Treblinka for Palestinians?'
 'Why does Quora automatically ban conservative opinions when reported, but does not do the same for liberal views?'
 'Is it crazy if I wash or wipe my groceries off? Germs are everywhere.'
 'Is there such a thing as dressing moderately, and if so, how is that different than dressing modestly?'
 'Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved, completely disregarding their feelings/lives so you get to have something go your way and feel temporarily at ease. How did things change?']


In [95]:
# Clean every training question with the default text_to_wordlist settings.
train_questions = [text_to_wordlist(raw_question) for raw_question in train_X]

In [112]:
# Peek at cleaned questions 1..9, the same slice as the raw preview.
print (np.array(train_questions[1:10]))


['Do you have an adopted dog how would you encourage people to adopt and not shop'
 'Why does velocity affect time Does velocity affect space geometry'
 'How did Otto von Guericke used the Magdeburg hemispheres'
 'Can I convert montra helicon D to a mountain bike by just changing the tyres'
 'Is Gaza slowly becoming Auschwitz Dachau or Treblinka for Palestinians'
 'Why does Quora automatically ban conservative opinions when reported but does not do the same for liberal views'
 'Is it crazy if I wash or wipe my groceries off Germs are everywhere'
 'Is there such a thing as dressing moderately and if so how is that different than dressing modestly'
 'Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved completely disregarding their feelings lives so you get to have something go your way and feel temporarily at ease How did things change']


In [97]:
# Convert the cleaned questions to token-id sequences with the loaded tokenizer.
train_tokens = tokenizer.texts_to_sequences(train_questions)

In [125]:
# Peek at the token-id sequences for questions 1..9.
print (np.array(train_tokens[1:10]))

[list([11, 14, 24, 30, 3716, 466, 9, 36, 14, 3624, 38, 5, 3043, 10, 43, 1806])
 list([16, 26, 1979, 371, 72, 26, 1979, 371, 447, 5416])
 list([9, 50, 12458, 8087, 50016, 121, 1, 38804, 26838])
 list([15, 7, 1112, 41663, 91364, 563, 5, 4, 3015, 1499, 49, 97, 1452, 1, 9106])
 list([3, 8995, 3868, 763, 18687, 45334, 23, 91365, 13, 2572])
 list([16, 26, 107, 2493, 1623, 1443, 2372, 33, 4219, 66, 26, 43, 11, 1, 140, 13, 1130, 891])
 list([3, 17, 1927, 20, 7, 2893, 23, 5773, 18, 12228, 222, 12015, 12, 3613])
 list([3, 40, 204, 4, 184, 37, 6372, 19024, 10, 20, 54, 9, 3, 19, 129, 73, 6372, 24411])
 list([3, 17, 97, 56, 23, 24, 14, 92, 117, 6, 65, 2134, 14977, 14, 1339, 3101, 5, 1, 38, 14, 652, 1691, 895, 18338, 57, 1085, 1032, 54, 14, 34, 5, 24, 192, 109, 29, 81, 10, 103, 6876, 46, 6462, 9, 50, 146, 166])]


In [None]:


    
    # NOTE(review): this cell looks like the body of a function whose `def`
    # line is missing from the notebook: it is indented, reads `PreProcess`
    # and ends in `return`. The `if PreProcess:` branch below is also empty.
    # The cell has no execution count and cannot run as written.
    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    if PreProcess:
   

    else:
        tokenizer.fit_on_texts(list(train_X)+list(test_X))
        train_X = tokenizer.texts_to_sequences(train_X)
        test_X = tokenizer.texts_to_sequences(test_X)

    print(len(train_X), 'train sequences')
    print(len(test_X), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, train_X)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, test_X)), dtype=int)))
    print('Max train sequence length: {}'.format(np.max(list(map(len, train_X)))))
    print('Max test sequence length: {}'.format(np.max(list(map(len, test_X)))))  
    
    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
  
    print(len(tokenizer.word_index))
    print(len(tokenizer.word_counts))
    
    return train_X, test_X, train_y, test_df, tokenizer.word_index