In [1]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
tqdm.pandas()
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, CuDNNGRU, MaxPool2D, Conv2D, Concatenate, SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Reshape, Flatten, Dropout, Activation
from keras.layers import Activation, Wrapper
from keras.engine.topology import Layer
from keras import backend as K

from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


Using TensorFlow backend.


In [2]:
# Read the competition's train/test CSVs and report how many rows/columns each has.
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")
print("train_original shape : ", train.shape)
print("test_original shape : ", test.shape)

train_original shape :  (1306122, 3)
test_original shape :  (375806, 2)


## load word embeddings

In [3]:
def load_word_embedding(filepath, embed_size=300):
    """
    Given a filepath to a text embeddings file, return a word to vec
    dictionary, in other words, word_embedding
    Ex. {'word': array([1.97, -0.63, ..., 0.573, 2.54])}

    Every line is expected to look like ``word v1 v2 ... vN`` with single
    spaces between fields. Lines that do not carry exactly `embed_size`
    values (for instance a "<count> <dim>" header line) are skipped.

    Parameters
    ----------
    filepath : str
        Path to the embeddings text file.
    embed_size : int, default 300
        Number of values each kept vector must have.

    Returns
    -------
    dict
        Maps each word to a float32 numpy array of length `embed_size`.
        If a word occurs on several valid lines, the last one wins.
    """
    def _read(**open_kwargs):
        vectors = {}
        # `with` closes the file handle even if parsing fails part-way.
        with open(filepath, **open_kwargs) as handle:
            for line in handle:
                # Split on a literal space only, and strip just the line
                # terminator so the word token itself is left untouched.
                word, *coefs = line.rstrip('\r\n').split(' ')
                # Sanity check done *before* conversion: wrong-length lines
                # are dropped without paying for a float parse.
                if len(coefs) != embed_size:
                    continue
                vectors[word] = np.asarray(coefs, dtype='float32')
        return vectors

    print('load word embedding ......')

    try:
        # First attempt uses the platform default encoding.
        return _read()
    except UnicodeDecodeError:
        # Fall back to UTF-8 and silently drop undecodable bytes.
        return _read(encoding="utf8", errors='ignore')

In [4]:
# Location of the pre-trained GloVe vectors (840B / 300d, per the file name).
EMBEDDING_FILE_glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
# Parse the file into a {word: float32 vector} dictionary; this reads the whole
# file, so it is the slow step of the notebook.
embeddings_index_glove = load_word_embedding(EMBEDDING_FILE_glove)

load word embedding ......


## preprocessing

In [5]:
import re
import string

In [6]:
# ASCII punctuation provided by the standard library.
regular_punct = list(string.punctuation)
# Extra symbols to isolate. Note that a few accented letters
# (e.g. 'é', 'à', 'â') are part of this list as well.
extra_punct = [
        ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
        '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
        '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
        '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
        '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
        'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
        '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# Union of both lists with duplicates removed.
all_punct = list(set(regular_punct) | set(extra_punct))


def spacing_punctuation(text):
    """
    Surround every symbol from `all_punct` that occurs in `text` with one
    space on each side and return the resulting string.
    """
    for symbol in all_punct:
        # Only rewrite the string when the symbol is actually present.
        text = text.replace(symbol, f" {symbol} ") if symbol in text else text
    return text

In [7]:
# Replacement table (quora vs. glove): maps a spelling found in the questions
# to the text it should be replaced with. Built once at definition time
# instead of on every call.
_MISSPELL_MAP = {
    'demonitisation': 'demonetization', 'demonitization': 'demonetization',
    'demonetisation': 'demonetization',
    'pokémon': 'pokemon',
    'Wjy': 'Why',
    'Whst': 'What',
    'BNBR': 'Be Nice Be Respectful',
    'Bolsonaro': 'Jair Bolsonaro',
    'XXXTentacion': 'Tentacion',
    'Žižek': 'Slovenian philosopher Slavoj Žižek',
    'Adityanath': 'Indian monk Yogi Adityanath',
    'Brexit': 'British Exit',
    'Brexiter': 'British Exit supporter',
    'Brexiters': 'British Exit supporters',
    'Brexiteer': 'British Exit supporter',
    'Brexiteers': 'British Exit supporters',
    'Brexiting': 'British Exit',
    'Brexitosis': 'British Exit disorder',
    'brexit': 'British Exit',
    'brexiters': 'British Exit supporters',
    'cryptocurrencies': 'cryptocurrency',
    'Cryptocurrency': 'cryptocurrency',
    'Litecoin': 'cryptocurrency',
    'litecoin': 'cryptocurrency',
    'altcoin': 'cryptocurrency',
    'altcoins': 'cryptocurrency',
    'jallikattu': 'Jallikattu',
    'Swachh': 'Swachh Bharat mission campaign ',
    'SJWs': 'social justice warrior',
    'Quorans': 'Quoran',
    'Qoura': 'Quora',
    'quoras': 'Quora',
    'Quroa': 'Quora',
    'QUORA': 'Quora',
    'narcissit': 'narcissist',
    'ethereum': 'Ethereum',
    'Blockchain': 'blockchain',
    'UCEED': 'Undergraduate Common Entrance Examination for Design',
    'GDPR': 'General Data Protection Regulation',
    'Redmi': 'Xiaomi smartphone',
    'OnePlus': 'Android smartphone',
    'Machedo': 'hot guy',
    'Coinbase': 'bitcoin broker',
    'coinbase': 'bitcoin broker',
    'DCEU': 'American media franchise',
    'IIEST': 'Indian Institutes of Engineering Science and Technology',
    'Upwork': 'global freelancing platform',
    'upwork': 'global freelancing platform',
    'HackerRank': 'technology company focuses on competitive programming challenges',
}

# Regex alternation tries branches left to right, so longer keys must come
# first; otherwise 'Brexit' would win over 'Brexiteers' and leave "eers"
# behind. Keys are escaped so they are always matched literally.
_MISSPELL_RE = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(_MISSPELL_MAP, key=len, reverse=True)))


def clean_misspell(text):
    """
    misspell list (quora vs. glove)

    Replace every occurrence of a known misspelling / out-of-vocabulary term
    in `text` with its substitute and return the new string. Matching is
    case-sensitive, is done on raw substrings (no word-boundary check), and
    is a single pass: replacement text is not scanned again.

    reference: https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694 # noqa
    """
    # Every match is, by construction, a key of the table.
    return _MISSPELL_RE.sub(lambda match: _MISSPELL_MAP[match.group(0)], text)

In [8]:
def preprocess(text):
    """
    preprocess text main steps

    1. replace known misspellings / rare terms (clean_misspell)
    2. put spaces around punctuation and symbols (spacing_punctuation)

    Returns the cleaned string.
    """
    # Misspelling fixes must run first: 'é' is one of the symbols that
    # spacing_punctuation isolates, so spacing first would turn 'pokémon'
    # into 'pok é mon' and the 'pokémon' rule could never match.
    text = clean_misspell(text)
    text = spacing_punctuation(text)

    return text

## modeling

In [9]:
## some config values
# These three constants are read as globals by the cells below
# (tokenizer, padding, embedding matrix and model definition).
embed_size = 300 # how big is each word vector
max_features = 150000 # how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 75 # max number of words in a question to use; passed to pad_sequences as maxlen

In [10]:
## preprocess
# Fill missing questions *before* preprocessing: preprocess() performs string
# operations, so a NaN reaching it would raise instead of being replaced.
train['question_text'] = train['question_text'].fillna("_##_").progress_apply(preprocess)
test['question_text'] = test['question_text'].fillna("_##_").progress_apply(preprocess)

## split to train and val
train_df, val_df = train_test_split(train, test_size=0.05, random_state=42)

## missing values were already filled above, so the text can be used as is
train_X = train_df["question_text"].values
val_X = val_df["question_text"].values
test_X = test["question_text"].values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features) # glove

# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

# shuffling the data; the same permutation is applied to X and y so that
# rows stay aligned with their labels
np.random.seed(42)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]

## word index dictionary from input text/document
word_index = tokenizer.word_index

100%|██████████| 1306122/1306122 [00:27<00:00, 47162.78it/s]
100%|██████████| 375806/375806 [00:07<00:00, 50184.97it/s]


## create embedding matrix

In [11]:
def create_embedding_weights(word_index, embeddings_index, max_features):
    '''
    input: word_index from training set and embedding from embedding file
    output: matrix with matched embedding arrays

    Parameters
    ----------
    word_index : dict
        Maps word -> integer index. Indices are expected to start at 1
        (as produced by the Keras Tokenizer), leaving row 0 unused by words.
    embeddings_index : dict
        Maps word -> 1-D vector; all vectors must have the same length.
    max_features : int
        Upper bound on the number of rows in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Shape (min(max_features, len(word_index) + 1), vector length).
        Row i holds the pre-trained vector of the word with index i; rows
        without a pre-trained vector keep a random draw from a normal
        distribution with the mean / std of all pre-trained values.
    '''

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    # +1 because indices are 1-based: the largest index equals
    # len(word_index) and must still be a valid row.
    nb_words = min(max_features, len(word_index) + 1)

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    # matching the word and updating the corresponding embedding vector
    for word, i in word_index.items():
        # Skip words whose index falls outside the matrix.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [12]:
# Build the initial weights for the Embedding layer from the tokenizer's
# vocabulary and the GloVe vectors loaded earlier.
embedding_matrix = create_embedding_weights(word_index, embeddings_index_glove, max_features)

  import sys


In [13]:
def lstm_model(embedding_matrix):
    """
    Build and compile the binary classifier.

    Architecture: Embedding (initialised from `embedding_matrix`) ->
    SpatialDropout1D(0.2) -> bidirectional CuDNNLSTM(64) ->
    bidirectional CuDNNGRU(64) -> global max pooling -> Dense(32, relu) ->
    Dense(16, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array (vocabulary rows x vector length) used as the initial
        Embedding weights.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer
        and accuracy as metric. Input length is the global `maxlen`.
    """
    # Size the Embedding layer from the matrix itself so that the layer and
    # its initial weights can never disagree in shape.
    vocab_size, vector_size = embedding_matrix.shape

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru
    x = Bidirectional(CuDNNGRU(64, return_sequences=True), name='bidirectional_gru')(x)
    # global_max_pooling1d
    x = GlobalMaxPooling1D()(x)
    x = Dense(32, activation="relu", name='dense_1')(x)
    x = Dense(16, activation="relu", name='dense_2')(x)
    # output layer
    output_layer = Dense(1, activation="sigmoid", name='output')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [14]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

def train_pred(model, epochs=2):
    """
    Train `model` on the global train split and predict on validation / test.

    The model is fitted one epoch at a time (batch size 512) on
    `train_X` / `train_y`, with `val_X` / `val_y` passed as validation data.

    Parameters
    ----------
    model : object exposing Keras-style `fit` and `predict`
    epochs : int, default 2
        Number of single-epoch `fit` calls.

    Returns
    -------
    tuple
        (pred_val_y, pred_test_y): the model's predictions for `val_X`
        and `test_X`, taken after the last epoch.
    """
    for _ in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

    # Only the predictions of the final model are returned, so predict once
    # after training instead of once per epoch (this also keeps pred_val_y
    # defined when epochs == 0).
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)

    return pred_val_y, pred_test_y

In [15]:
# Instantiate the compiled network with the GloVe-initialised embedding weights.
model = lstm_model(embedding_matrix)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [16]:
# Train for two epochs and keep the validation / test predictions of the final model.
pred_val_y, pred_test_y = train_pred(model, epochs = 2) # GloVe only

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [17]:
# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01, score each one
# with F1 on the validation predictions, and keep the best-scoring threshold.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# Highest F1 first; ties keep their original (ascending threshold) order.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.5978407557354926
F1 score at threshold 0.11 is 0.606505047019239
F1 score at threshold 0.12 is 0.6145704709825366
F1 score at threshold 0.13 is 0.6221463239259392
F1 score at threshold 0.14 is 0.6281218552739913
F1 score at threshold 0.15 is 0.6321166527352094
F1 score at threshold 0.16 is 0.6358926048045219
F1 score at threshold 0.17 is 0.6396714735937352
F1 score at threshold 0.18 is 0.6437137330754352
F1 score at threshold 0.19 is 0.6479701902333792
F1 score at threshold 0.2 is 0.6511027220345719
F1 score at threshold 0.21 is 0.6539350688511408
F1 score at threshold 0.22 is 0.6565133516093006
F1 score at threshold 0.23 is 0.6606446314925067
F1 score at threshold 0.24 is 0.6620375168411234
F1 score at threshold 0.25 is 0.6644295302013423
F1 score at threshold 0.26 is 0.6647588765235823
F1 score at threshold 0.27 is 0.6671664167916043
F1 score at threshold 0.28 is 0.6702656013819909
F1 score at threshold 0.29 is 0.6716092455298736
F1 score at threshold 0

In [18]:
# Turn the test predictions into 0/1 labels with the threshold picked on validation.
pred_test_y = (pred_test_y > best_thresh).astype(int)
# `test` was read from the same ../input/test.csv and only its question_text
# column was modified, so its qid column can be reused instead of re-reading
# the file from disk.
test_df = test[["qid"]]
out_df = pd.DataFrame({"qid":test_df["qid"].values})
# The model ends in Dense(1), so predictions come back as a column vector;
# flatten to 1-D before storing them as a DataFrame column.
out_df['prediction'] = np.ravel(pred_test_y)
out_df.to_csv("submission.csv", index=False)