## Key-Value Attention for Text Classification

Inspired by

- [Frustratingly Short Attention Spans in Neural Language Modeling](https://github.com/arXivTimes/arXivTimes/issues/215) 
- [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)


Code for the Kaggle competition: [**Toxic Comment Classification Challenge**](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, Bidirectional, Dense,Merge,RepeatVector,Multiply,Lambda
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.layers.core import Activation, Dense
from keras.layers.wrappers import TimeDistributed
import keras.backend as K

from keras.layers.merge import Concatenate

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Path to the pre-trained GloVe word vectors (plain-text format).
EMBEDDING_FILE = 'data/glove.6B.100d.txt'

# Load the competition CSV files into DataFrames.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

# Raw comment strings; missing comments are replaced by the literal "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)
# One column per label, giving a 2-D target array.
y_train = train.loc[
    :, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values


In [None]:

# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding
# layer as its input dimension.
max_features = 30000
# Sequence length handed to pad_sequences for every comment.
maxlen = 100
# Width of each word vector; matches the 100-d GloVe file in EMBEDDING_FILE.
embed_size = 100


In [None]:

# Fit a single vocabulary over the train and test comments; num_words caps
# the vocabulary used when texts are converted to index sequences.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Replace the raw strings with lists of word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)
# Bring every sequence to length maxlen so they stack into 2-D arrays.
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train, X_test)
)


def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, i.e. the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into a {word: float32 vector} dict.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes (the original left it open), and it is read as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

# Build the initial weights for the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i. Words without a
# pre-trained vector keep an all-zero row.
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is reserved), so a vocabulary of N words
# needs N + 1 rows. Without the +1, a vocabulary smaller than max_features
# made the last word index fall outside the matrix.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index does not fit in the matrix (beyond the cap).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



In [None]:

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on held-out data during training.

    Args:
        validation_data: an ``(X_val, y_val)`` pair; ``X_val`` is fed to
            ``model.predict`` and ``y_val`` is compared with the predictions
            via ``roc_auc_score``. Must contain exactly two items, otherwise
            unpacking raises ``ValueError``.
        interval: the score is computed on epochs whose zero-based index is
            a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise through this class's own MRO. The original
        # ``super(Callback, self)`` started the lookup *after* Callback and
        # therefore never ran Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation ROC-AUC at the end of selected epochs.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; unused here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Report epochs one-based to match the Keras progress output.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [None]:
# Implementation using Keras: an LSTM encoder followed by attention pooling
# and a six-way sigmoid classifier.
# NOTE(review): the `$$$` tokens below are not valid Python; they look like
# fill-in-the-blank placeholders that must be replaced before this cell runs.

hid_dim=256  # must be a multiple of 2
att_dim = 32 # dimensionality used when computing the attention weights

# Input: one padded sequence of maxlen word indices per example.
inp = Input(shape=(maxlen, ))
# Embedding lookup initialised from the pre-built embedding_matrix.
# NOTE(review): embedding_matrix has nb_words rows while the layer is declared
# with max_features inputs; confirm the two agree for the data in use.
embedding = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

# return_sequences=True keeps one hidden state per time step for the attention.
hs = LSTM(hid_dim, return_sequences=True,name='LSTM')(embedding)


# Placeholders. Presumably `hs` is meant to be split into a key part and a
# value part (hence hid_dim being a multiple of 2) — TODO confirm.
key   = $$$
value= $$$

# Per-time-step scoring network: tanh projection to att_dim, then one scalar.
u = TimeDistributed(Dense(att_dim, activation='tanh'),name='T1')($$$)
score = TimeDistributed(Dense(1),name='T2')(u)
# Reshape the scores to (batch, maxlen) before applying softmax.
score_ = Lambda(lambda x: K.reshape(x, (K.shape(x)[0], maxlen)))(score)
alpha_=Activation('softmax')(score_)
# Add an axis back to the attention weights with K.expand_dims.
alpha = Lambda(lambda x: K.expand_dims(x))(alpha_)

# Element-wise product of the two placeholder operands (presumably the
# attention weights and the values — TODO confirm).
alphahs=Multiply(name='attention_mul')([$$$,$$$])

# Sum over axis 1 to collapse the sequence into one vector per example.
v = Lambda(lambda x: K.sum(x, axis=1))(alphahs)

# Six independent sigmoid outputs, one per label column in y_train.
out = Dense(6, activation="sigmoid")(v)
model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 2

# Hold out part of the labelled data for validation (95% is used for training).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)
# Prints the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(interval=1, validation_data=(X_val, y_val))

hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[RocAuc],
    validation_data=(X_val, y_val),
)



In [None]:
# y_pred = model.predict(x_test, batch_size=1024)
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('submission.csv', index=False)