In [76]:
"""
Fork of https://www.kaggle.com/antmarakis/bi-lstm-conv-layer?scriptVersionId=2789290
Just replaced the data with Preprocessed data
Public LB score 0.9833 => 0.9840

"""

import numpy as np
import pandas as pd
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D, CuDNNLSTM
from keras.layers import Dropout, Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from unidecode import unidecode
import re
import xgboost as xgb
import gc

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
special_character_removal=re.compile(r'[^a-z\?\!\#\@\%\* ]',re.IGNORECASE)
def clean_text(x):
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

train['clean_text'] = train['comment_text'].apply(lambda x: clean_text(str(x)))
test['clean_text'] = test['comment_text'].apply(lambda x: clean_text(str(x)))

In [4]:
max_features=100000
maxlen=150
embed_size=300

# max_features=50000
# maxlen=900
# embed_size=301

train_y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
train_x = train['clean_text']#.str.lower()

test_x = test['clean_text']#.str.lower()

In [5]:
# Vectorize text + Prepare fasttext Embedding
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train_x))

train_x = tokenizer.texts_to_sequences(train_x)
test_x = tokenizer.texts_to_sequences(test_x)

train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=maxlen)

In [6]:
# for i in train_x[0]:
#     print(list(tokenizer.word_index.keys())[list(tokenizer.word_index.values()).index(i)])

In [6]:
EMBEDDING_FILE="../data/fasttext/crawl-300d-2M.vec"
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
# Build Model
inp = Input(shape=(maxlen,))

embedding_layer = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = SpatialDropout1D(0.35)(embedding_layer)

x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
#x = Dropout(0.3)(x) #dropout doesnt improve
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)

avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

out = Dense(6, activation='sigmoid')(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
embeddings = model.layers[1].get_weights()[0]

# or access the embedding layer through the constructed model 
# first `0` refers to the position of embedding layer in the `model`
embeddings = model.layers[1].get_weights()[0]

# `embeddings` has a shape of (num_vocab, embedding_dim) 

# `word_to_index` is a mapping (i.e. dict) from words to their index, e.g. `love`: 69
words_embeddings = {w:embeddings[idx] for w, idx in word_index.items()  if idx < 100000}

# now you can use it like this for example
print(words_embeddings['love'])  # possible output: [0.21, 0.56, ..., 0.65, 0.10]

[-0.2757 -0.0343  0.1668  0.0358 -0.0805 -0.0105  0.0752  0.324   0.1245
 -0.002  -0.0176  0.3054 -0.0302  0.2219 -0.1233 -0.1776 -0.3783 -0.0099
 -0.0945  0.1197 -0.0771 -0.2172 -0.0581 -0.2592 -0.0912  0.207   0.0356
  0.1817 -0.1424  0.0026 -0.2029 -0.0429  0.0164 -0.3866 -0.0228 -0.1913
  0.025   0.0919  0.1341  0.28   -0.1914 -0.0225  0.0942 -0.0417 -0.0278
 -0.0698 -0.1828 -0.0449  0.0688  0.14   -0.0579 -0.1856  0.1317  0.0861
 -0.2081 -0.0542  0.0502 -0.0092 -0.1887 -0.0216  0.0347 -0.269  -0.1502
  0.3031  0.0336  0.159   0.1326 -0.1242 -0.1914 -0.2266  0.1209 -0.0246
 -0.1972  0.3093 -0.0582  0.1337 -0.0827 -0.0721  0.3924 -0.2054  0.2582
  0.18   -0.2151  0.0502 -0.3227  0.0237 -0.0227  0.2881  0.0154 -0.1839
 -0.1311 -0.0507 -0.157   0.1108 -0.168  -0.3899 -0.0335 -0.0088 -0.1911
  0.1927 -0.0023  0.105   0.0059  0.2258  0.5905 -0.1266 -0.1993 -0.2283
 -0.274  -0.1581 -0.2229 -0.1595  0.1269  0.0143 -0.7361  0.0523  0.2621
  0.0669 -0.1305 -0.0139  0.0975 -0.3305 -0.0738  0

In [61]:
#train_x[0]
def sent2vec(s):
    M = []
    for index in train_x[0]:
        if index != 0:
            M.append(words_embeddings[res[index]])
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
           return np.zeros(300)
    return(v / np.sqrt((v ** 2).sum()))

In [66]:
xtrain_fasttext = [sent2vec(x) for x in train_x]
xtest_fasttext = [sent2vec(x) for x in test_x]

In [70]:
xtrain_fasttext = np.array(xtrain_fasttext)
xtest_fasttext = np.array(xtest_fasttext)

In [82]:
def runXGB(train_X, train_y, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    #param['booster'] = "gbtree", 
    param['eta'] = 0.2
    param['max_depth'] = 3
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 4
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

#     if test_y is not None:
#         xgtest = xgb.DMatrix(test_X, label=test_y)
#         watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
#         model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
#     else:
        #xgtest = xgb.DMatrix(test_X)
    model = xgb.train(plst, xgtrain, num_rounds)

    return model  

In [83]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(xtrain_fasttext, train_y[:,i])
    preds[:,i] = model.predict(xgb.DMatrix(xtest_fasttext), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [85]:
preds[:10,:]

array([[0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266],
       [0.09606097, 0.00993686, 0.05294036, 0.00299561, 0.04932525,
        0.00879266]])

In [75]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(objective = "binary:logistic", 
          booster = "gbtree", 
          eval_metric = "auc", 
          #nthread = 4, 
          eta = 0.2, 
          max_depth = 3,
          min_child_weight = 4,
          subsample = 0.7,
          colsample_bytree = 0.7, silent=False)
clf.fit(xtrain_fasttext, train_y)

ValueError: bad input shape (159571, 6)

In [7]:
# Prediction
batch_size = 512
epochs = 5

model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd119439860>

In [8]:
predictions = model.predict(test_x, batch_size=batch_size, verbose=1)



In [9]:
# Create submission file
output=pd.DataFrame(data=predictions, index=test["id"])
output.to_csv("./output/bilstm_conv_embedding_matrix.csv",header=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
              ,index=True)