In [1]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
tqdm.pandas()
import math
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, CuDNNGRU, MaxPool2D, Conv2D, Concatenate, SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Reshape, Flatten, Dropout, Activation
from keras.layers import Activation, Wrapper
from keras.engine.topology import Layer
from keras import backend as K

from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


Using TensorFlow backend.


In [2]:
# Read the train and test CSVs, then report the (rows, columns) of each frame.
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")
print("train shape : ", train.shape)
print("test shape : ", test.shape)

train shape :  (1306122, 3)
test shape :  (375806, 2)


In [3]:
# Preview the first rows of the test frame (columns shown: qid, question_text).
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


## load and create embedding matrix

In [4]:
def load_word_embedding(filepath, embed_dim=300):
    """
    Given a filepath to embeddings file, return a word to vec dictionary,
    in other words, word_embedding
    Ex. {'word': array([1.97, -0.63, ..., 0.573, 2.54])}

    Each line of the file is split on single spaces: the first token is the
    word, the remaining tokens are parsed as float32 components.

    Parameters
    ----------
    filepath : str
        Path to a space-separated text embedding file.
    embed_dim : int, default 300
        Expected vector length. Entries whose vector has a different length
        (e.g. a "<count> <dim>" header line) are dropped.

    Returns
    -------
    dict
        Mapping word -> 1-D float32 numpy array of length `embed_dim`.
    """
    def _get_vec(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    def _read(**open_kwargs):
        # The context manager closes the handle even if parsing fails midway;
        # the previous bare open() calls left the files open.
        with open(filepath, **open_kwargs) as handle:
            return dict(_get_vec(*line.split(' ')) for line in handle)

    print('load word embedding ......')

    try:
        word_embedding = _read()
    except UnicodeDecodeError:
        # Retry as UTF-8 and skip undecodable bytes instead of failing.
        word_embedding = _read(encoding="utf8", errors='ignore')

    # sanity check: keep only vectors with the expected length
    return {word: vec for word, vec in word_embedding.items()
            if len(vec) == embed_dim}

### GloVe

In [5]:
# Location of the GloVe 840B / 300d text file under ../input/embeddings.
EMBEDDING_FILE_glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
# Parse it into a {word: float32 vector} dict; vectors that are not 300 long are discarded by the loader.
embeddings_index_glove = load_word_embedding(EMBEDDING_FILE_glove)

load word embedding ......


### Paragram

In [6]:
# Location of the Paragram (paragram_300_sl999) text file under ../input/embeddings.
EMBEDDING_FILE_para = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
# Parse it into a {word: float32 vector} dict; the loader falls back to UTF-8 with errors ignored if decoding fails.
embeddings_index_para = load_word_embedding(EMBEDDING_FILE_para)

load word embedding ......


## preprocessing

In [7]:
import re
import string

In [8]:
# ASCII punctuation from the standard library.
regular_punct = list(string.punctuation)
# Additional symbols (typographic quotes, box-drawing glyphs, currency signs,
# accented letters, ...) that should also be isolated by spaces.
# The list overlaps with `regular_punct` and repeats a few entries
# ('»', '¾', '§', '£'); duplicates are removed when building `all_punct`.
extra_punct = [
        ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
        '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
        '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
        '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
        '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
        'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
        '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# De-duplicate while keeping first-seen order. `list(set(...))` produced an
# order that changes between interpreter runs (string hashing is randomised),
# which made the list non-reproducible; dict keys preserve insertion order.
all_punct = list(dict.fromkeys(regular_punct + extra_punct))

def spacing_punctuation(text):
    """
    Pad every punctuation mark / symbol listed in `all_punct` with one space
    on each side and return the resulting string.
    """
    for symbol in all_punct:
        if symbol not in text:
            continue
        text = text.replace(symbol, " " + symbol + " ")
    return text

In [9]:
# Replacement table (quora vocabulary vs. glove vocabulary). Built once at
# definition time instead of on every call.
_MISSPELL_MAP = {'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                 'demonetisation': 'demonetization',
                 'pokémon': 'pokemon',
                 'Wjy': 'Why',
                 'Whst' : 'What',
                 'BNBR': 'Be Nice Be Respectful',
                 'Bolsonaro': 'Jair Bolsonaro',
                 'XXXTentacion': 'Tentacion',
                 'Žižek': 'Slovenian philosopher Slavoj Žižek',
                 'Adityanath': 'Indian monk Yogi Adityanath',
                 'Brexit': 'British Exit',
                 'Brexiter': 'British Exit supporter',
                 'Brexiters': 'British Exit supporters',
                 'Brexiteer': 'British Exit supporter',
                 'Brexiteers': 'British Exit supporters',
                 'Brexiting': 'British Exit',
                 'Brexitosis': 'British Exit disorder',
                 'brexit': 'British Exit',
                 'brexiters': 'British Exit supporters',
                 'cryptocurrencies': 'cryptocurrency',
                 'Cryptocurrency': 'cryptocurrency',
                 'Litecoin' : 'cryptocurrency',
                 'litecoin' : 'cryptocurrency',
                 'altcoin' : 'cryptocurrency',
                 'altcoins' : 'cryptocurrency',
                 'jallikattu': 'Jallikattu',
                 'Swachh': 'Swachh Bharat mission campaign ',
                 'SJWs': 'social justice warrior',
                 'Quorans': 'Quoran',
                 'Qoura': 'Quora',
                 'quoras': 'Quora',
                 'Quroa': 'Quora',
                 'QUORA': 'Quora',
                 'narcissit': 'narcissist',
                 'ethereum': 'Ethereum',
                 'Blockchain': 'blockchain',
                 'UCEED': 'Undergraduate Common Entrance Examination for Design',
                 'GDPR': 'General Data Protection Regulation',
                 'Redmi' : 'Xiaomi smartphone',
                 'OnePlus': 'Android smartphone',
                 'Machedo' : 'hot guy',
                 'Coinbase':'bitcoin broker',
                 'coinbase':'bitcoin broker',
                 'DCEU' : 'American media franchise',
                 'IIEST': 'Indian Institutes of Engineering Science and Technology',
                 'Upwork' : 'global freelancing platform',
                 'upwork' : 'global freelancing platform',
                 'HackerRank' : 'technology company focuses on competitive programming challenges'}

# Regex alternation takes the FIRST alternative that matches at a position,
# so keys must be tried longest-first; otherwise 'Brexit' wins over
# 'Brexiters' and the longer entries can never be applied. The sort is
# stable, so equally long keys keep their table order. Keys are escaped so
# that a future key containing a regex metacharacter is matched literally.
_MISSPELL_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key)
        for key in sorted(_MISSPELL_MAP, key=len, reverse=True)))


def clean_misspell(text):
    """
    misspell list (quora vs. glove)

    Replace every occurrence of a key of the misspelling table found in
    `text` with its mapped spelling / expansion and return the new string.
    Matching is case-sensitive, works on substrings (no word-boundary
    check) and is done in a single pass, so replacement text is not
    re-scanned.

    reference: https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694 # noqa
    """
    # Every match is, by construction, a key of the table, so a plain lookup
    # is safe (the former try/except around dict.get could never trigger).
    return _MISSPELL_RE.sub(lambda match: _MISSPELL_MAP[match.group(0)], text)

In [10]:
def preprocess(text):
    """
    preprocess text main steps

    1. put spaces around punctuation and symbols (`spacing_punctuation`)
    2. replace known misspellings / rare terms (`clean_misspell`)

    Case is left untouched on purpose: the misspelling table is
    case-sensitive ('Brexit', 'QUORA', 'Wjy', ...), so lowercasing here would
    stop those entries from matching. The former bare `text.lower()` call
    discarded its result (strings are immutable) and has been removed, which
    keeps the output identical to what the pipeline produced before.
    NOTE(review): lowercasing is presumably left to the Keras Tokenizer
    (its `lower` option defaults to True) -- confirm.
    """
    text = spacing_punctuation(text)
    text = clean_misspell(text)

    return text

## modeling

In [11]:
## some config values 
embed_size = 300 # length of each word vector
max_features = 150000 # vocabulary cap: passed to Tokenizer(num_words=...) and used as the row limit of the embedding matrix
maxlen = 75 # sequence length passed to pad_sequences(maxlen=...) and used as the model input width

## tokenize text

In [12]:
## fill up the missing values, then preprocess
# fillna must run BEFORE preprocess: preprocess only handles strings, so a
# NaN question would raise before it could ever be replaced by the "_##_"
# placeholder.
# NOTE: this overwrites the raw column, so re-running the cell applies
# preprocess a second time (e.g. 'Bolsonaro' -> 'Jair Bolsonaro' -> 'Jair Jair Bolsonaro').
train['question_text'] = train['question_text'].fillna("_##_").progress_apply(preprocess)
test['question_text'] = test['question_text'].fillna("_##_").progress_apply(preprocess)

train_X = train["question_text"].values
test_X = test["question_text"].values

## Tokenize the sentences
# the vocabulary is fitted on the training questions only
tokenizer = Tokenizer(num_words=max_features)

tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train['target'].values

#shuffling the data
# the same permutation is applied to inputs and labels so they stay aligned;
# the `train` DataFrame itself keeps its original row order
np.random.seed(42)
trn_idx = np.random.permutation(len(train_X))

train_X = train_X[trn_idx]
train_y = train_y[trn_idx]

## word index dictionary from input text/document
word_index = tokenizer.word_index

100%|██████████| 1306122/1306122 [00:25<00:00, 51716.38it/s]
100%|██████████| 375806/375806 [00:07<00:00, 50165.39it/s]


## create embedding matrix

In [13]:
def create_embedding_weights(word_index, embeddings_index, max_features):
    '''
    input: word_index from training set and embedding from embedding file
    output: matrix with matched embedding arrays

    word_index:       dict word -> integer index (Keras Tokenizer indices
                      start at 1; row 0 is never assigned a word)
    embeddings_index: dict word -> 1-D vector, all of the same length
    max_features:     upper bound on the number of rows of the matrix

    Returns an array of shape (min(max_features, len(word_index) + 1), dim).
    Rows are first drawn from a normal distribution with the mean / std of
    all pretrained values; rows whose word has a pretrained vector are then
    overwritten with it.
    '''
    # np.stack needs a real sequence; passing the dict view directly is
    # deprecated and rejected by newer numpy releases.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    # +1 because indices are 1-based: the largest index equals
    # len(word_index) and needs its own row. Without it a vocabulary smaller
    # than max_features indexed one past the end of the matrix.
    nb_words = min(max_features, len(word_index) + 1)

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    # matching the word and updating the corresponding embedding vector
    for word, i in word_index.items():
        # bound by the actual number of rows, not by max_features
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [14]:
# One weight matrix per embedding source, both indexed by the tokenizer's word_index.
embedding_matrix_glove = create_embedding_weights(
    word_index, embeddings_index_glove, max_features)
embedding_matrix_para = create_embedding_weights(
    word_index, embeddings_index_para, max_features)
# Blend the two sources with an element-wise average.
embedding_matrix = np.mean([embedding_matrix_glove, embedding_matrix_para], axis=0)
embedding_matrix.shape

  import sys


(150000, 300)

In [15]:
def lstm_model(embedding_matrix):
    """
    Build and compile the binary classifier.

    Architecture: Embedding (initialised from `embedding_matrix`) ->
    SpatialDropout1D(0.2) -> bidirectional CuDNNLSTM(128) -> bidirectional
    CuDNNGRU(128) stacked on the LSTM output. Both recurrent outputs are
    max-pooled over the time axis, concatenated and fed to a single sigmoid
    unit. The input length is the module-level `maxlen`.

    embedding_matrix: 2-D array (vocabulary rows x vector size) used as the
                      initial Embedding weights.
    Returns the compiled Keras `Model` (binary cross-entropy, adam, accuracy).
    """
    # Size the Embedding layer from the weights actually passed in rather
    # than from the `max_features` / `embed_size` globals, so the layer can
    # never disagree with the shape of its initial weights.
    vocab_size, vector_size = np.shape(embedding_matrix)

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights = [embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x1 = Bidirectional(CuDNNLSTM(128, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru, fed with the LSTM output sequence
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True), name='bidirectional_gru')(x1)
    # attention (disabled)
    #atten = Attention(step_dim=maxlen)(x1)

    # global max pooling over time for each recurrent output
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    x = Concatenate()([max_pool1, max_pool2])

    # output layer
    output_layer = Dense(1, activation="sigmoid", name = 'output')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [16]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    """
    Keras layer that reduces a 3-D input to 2-D by taking a learned,
    normalised weighted sum over axis 1.

    Weights created in `build`: `W` with shape `(input_shape[-1],)` and, when
    `bias` is true, `b` with shape `(input_shape[1],)`.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        step_dim: size of axis 1 of the input; used in `call` to reshape the
                  scores to (-1, step_dim).
        W_regularizer / b_regularizer: resolved through `regularizers.get`.
        W_constraint / b_constraint: resolved through `constraints.get`.
        bias: whether to create and add the bias `b`.
        **kwargs: forwarded to the base `Layer` constructor.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # placeholder; set to input_shape[-1] in build()
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create `W` (and `b` when bias is enabled) for a rank-3 input shape."""
        assert len(input_shape) == 3

        # NOTE(review): the shape is passed as the first positional argument;
        # this appears to rely on the Keras version in use accepting that
        # call form -- confirm before upgrading Keras.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # one bias value per position along axis 1
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to following layers."""
        return None

    def call(self, x, mask=None):
        """
        Score each position of axis 1, normalise the scores and return the
        weighted sum of `x` over axis 1.
        """
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten x to (-1, features_dim), project onto W viewed as a
        # (features_dim, 1) column, then reshape the scores to (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        # exponentiate; normalised below so the weights sum to ~1 along axis 1
        a = K.exp(eij)

        if mask is not None:
            # zero the weights at masked positions before normalising
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division defined when the sum is zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # add a trailing axis so the weights broadcast against x
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (input_shape[0], features_dim)."""
        return input_shape[0],  self.features_dim

In [17]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

# def train_pred(model, epochs=2):
#     for e in range(epochs):
#         model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
#         pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
#     pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    
    
    # Plot training & validation accuracy values
#     plt.plot(model.history['acc'])
#     plt.plot(model.history['val_acc'])
#     plt.title('Model accuracy')
#     plt.ylabel('Accuracy')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Test'], loc='upper left')
#     plt.show()

#     # Plot training & validation loss values
#     plt.plot(model.history.history['loss'])
#     plt.plot(model.history.history['val_loss'])
#     plt.title('Model loss')
#     plt.ylabel('Loss')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Test'], loc='upper left')
#     plt.show()
#     return pred_val_y, pred_test_y



def train_pred(model, train_X, train_y, val_X, val_y, input_threshold, epochs=2, test_data=None):
    """
    Fit `model` one epoch at a time, printing the validation F1 after each
    epoch, then predict on the validation and test inputs.

    model:           compiled Keras model
    train_X/train_y: training inputs and labels
    val_X/val_y:     validation inputs and labels
    input_threshold: probability cut-off used to binarise the validation
                     predictions for the F1 score
    epochs:          number of single-epoch `fit` calls (must be >= 1)
    test_data:       inputs to predict after training; when None, the
                     module-level `test_X` is used (previous behaviour)

    Returns (pred_val_y, pred_test_y, best_score) where `pred_val_y` and
    `best_score` come from the LAST epoch (not the best one), so the score
    always describes the predictions that are returned.

    Raises ValueError if `epochs` is smaller than 1.
    """
    if epochs < 1:
        # Without at least one epoch there would be no validation
        # predictions to return (this used to surface as UnboundLocalError).
        raise ValueError("epochs must be >= 1, got {}".format(epochs))
    if test_data is None:
        # backward-compatible fallback to the notebook-level test matrix
        test_data = test_X

    for epo in range(epochs):
        # train model
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y), verbose=0)
        # create validation set's predictions
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
        # check score of validation set
        best_score = metrics.f1_score(val_y, (pred_val_y > input_threshold).astype(int))
        print("Epoch: ", epo, "-    Val F1 Score: {:.4f}".format(best_score))
    # create test set's predictions
    pred_test_y = model.predict([test_data], batch_size=1024, verbose=0)
    print('=' * 60)
    return pred_val_y, pred_test_y, best_score

## StratifiedKFold splits

In [18]:
best_thresh = 0.4
# Out-of-fold predictions for every training row, and the running
# fold-average of the predictions for every test row.
train_meta = np.zeros(len(train_y))
test_meta = np.zeros(len(test_X))

# create 4 folds, StratifiedKFold b/c imbalanced class
folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
splits = list(folds.split(X = train_X, y = train_y))

for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train, y_train = train_X[train_idx], train_y[train_idx]
    X_val, y_val = train_X[valid_idx], train_y[valid_idx]

    # a fresh model is built and trained for every fold
    model = lstm_model(embedding_matrix)
    pred_val_y, pred_test_y, best_score = train_pred(
        model, X_train, y_train, X_val, y_val, best_thresh, epochs = 3)
    # store this fold's validation predictions at their original positions
    train_meta[valid_idx] = np.ravel(pred_val_y)
    # each fold contributes 1/len(splits) of the final test prediction
    test_meta += np.ravel(pred_test_y) / len(splits)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch:  0 -    Val F1 Score: 0.6741
Epoch:  1 -    Val F1 Score: 0.6728
Epoch:  2 -    Val F1 Score: 0.6684
Epoch:  0 -    Val F1 Score: 0.6715
Epoch:  1 -    Val F1 Score: 0.6729
Epoch:  2 -    Val F1 Score: 0.6798
Epoch:  0 -    Val F1 Score: 0.6731
Epoch:  1 -    Val F1 Score: 0.6747
Epoch:  2 -    Val F1 Score: 0.6767
Epoch:  0 -    Val F1 Score: 0.6588
Epoch:  1 -    Val F1 Score: 0.6751
Epoch:  2 -    Val F1 Score: 0.6754


## the final out-of-fold F1 score on the entire training set

In [19]:
# F1 of the out-of-fold predictions (train_meta, binarised at best_thresh) against the training labels.
metrics.f1_score(y_true=train_y, y_pred=train_meta > best_thresh)

0.6751423333011677

In [20]:
# Binarise the fold-averaged test predictions at best_thresh and write the
# submission file (columns: qid, prediction).
final_test_y = (test_meta > best_thresh).astype(int)
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid": test_df["qid"].values, "prediction": final_test_y})
out_df.to_csv("submission.csv", index=False)