In this kernel, I have implemented the encoder part of the Transformer architecture described in the well-known paper "Attention Is All You Need" (https://arxiv.org/abs/1706.03762).

Much of the remaining code is adapted from other kernels, for example loading the embeddings, loading the training and test data, and preprocessing. I really appreciate their contributions.

p.s. When I run this locally, I get validation f1-score around 0.688.

Happy transforming!

## Imports

In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.layers import BatchNormalization, InputSpec, add
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers, activations
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.utils import Sequence

C:\Users\amakr\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\amakr\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


## Some pre-configurations

In [2]:
# Global hyper-parameters shared by the tokenizer, the embedding layer and the encoder.
embed_size = 300 # dimensionality of each word vector
max_features = 45000 # vocabulary size cap (number of rows in the embedding matrix)
maxlen = 400 # every text is padded / truncated to this many tokens
n_heads = 4 # number of attention heads in the multi-head attention sub-layer

In [3]:
# def load_and_prec():
# Load the cleaned train / test CSV files and report their shapes.
train_df = pd.read_csv("../Translated/cleaned/train.csv")
test_df = pd.read_csv("../Translated/cleaned/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

# One-hot encode the 'Label' column; the indicator columns are named after the classes.
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_train = lb.fit_transform(train_df['Label'])

y_train = pd.DataFrame(y_train, columns= lb.classes_)
# Append the indicator columns to the training frame (concat aligns on the row index).
train_df = pd.concat([train_df, y_train], axis = 1)
# Class names in order of first appearance; used later to select the target columns.
cols_target = train_df.Label.unique().tolist()

## split to train and val
# train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=0,shuffle = True) # hahaha
# train_X, val_X, train_y , val_y = train_test_split(train_df, train_df[cols_target], test_size=0.1, random_state = 0,stratify = train_df['Label'])

# trn_idx = train_y.index.tolist()
# val_idx = val_y.index.tolist()


## fill up the missing values
# train_X = train_X["Text"].fillna("_##_").values
# val_X = val_X["Text"].fillna("_##_").values
# test_X = test_df["Text"].fillna("_##_").values

## Tokenize the sentences
# Fit the tokenizer on the training text only; num_words caps the word indices it emits.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df.Text)
# (disabled) sequence conversion and padding are done per fold inside train_pred instead
# train_X = tokenizer.texts_to_sequences(train_X)
# val_X = tokenizer.texts_to_sequences(val_X)
# test_X = tokenizer.texts_to_sequences(test_X)

# ## Pad the sentences
# train_X = pad_sequences(train_X, maxlen=maxlen,padding = 'post', truncating = 'post')
# val_X = pad_sequences(val_X, maxlen=maxlen,padding = 'post', truncating = 'post')
# test_X = pad_sequences(test_X, maxlen=maxlen,padding = 'post', truncating = 'post')

# ## Get the target values
# train_y = train_y.values
# val_y = val_y.values

# word -> integer index mapping learned by the tokenizer (covers every word seen, not just the top max_features)
word_index = tokenizer.word_index
#shuffling the data
# np.random.seed(2018)
# trn_idx = np.random.permutation(len(train_X))
# val_idx = np.random.permutation(len(val_X))

# train_X = train_X[trn_idx]
# val_X = val_X[val_idx]
# train_y = train_y[trn_idx]
# val_y = val_y[val_idx]    

#     return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index,val_idx , trn_idx

Train shape :  (1436, 3)
Test shape :  (620, 2)


## Load Embeddings

In [4]:
def load_glove(word_index):
    """Build an embedding matrix for ``word_index`` from the GloVe 840B/300d vectors.

    Reads the module-level ``max_features`` to cap the number of rows.

    Args:
        word_index: mapping ``word -> integer index`` (e.g. ``Tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(nb_words, embed_size)`` where
        ``nb_words = min(max_features, len(word_index))``. Row ``i`` holds the
        pretrained vector of the word whose index is ``i`` when the embedding
        file contains it; all other rows keep their random normal initialisation.
    """
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager closes the handle; explicit UTF-8 so decoding does not
    # depend on the platform's default encoding.
    with open(EMBEDDING_FILE, encoding="utf8") as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f)

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    # Hard-coded statistics (presumably precomputed for this file - TODO confirm).
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Bound by the matrix height, not max_features: indices start at 1, so
        # with a small vocabulary the last index equals len(word_index) and
        # would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix
    
def load_fasttext(word_index):
    """Build an embedding matrix for ``word_index`` from the fastText wiki-news 300d vectors.

    Reads the module-level ``max_features`` to cap the number of rows.

    Args:
        word_index: mapping ``word -> integer index`` (e.g. ``Tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(nb_words, embed_size)`` where
        ``nb_words = min(max_features, len(word_index))``. Rows without a
        pretrained vector keep a random normal initialisation drawn with the
        mean / std of the loaded embeddings.
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager closes the handle; explicit UTF-8 so decoding does not
    # depend on the platform's default encoding. Lines of 100 characters or
    # fewer are skipped (this drops short lines such as a header).
    with open(EMBEDDING_FILE, encoding="utf8") as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o)>100)

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Bound by the matrix height, not max_features: indices start at 1, so
        # with a small vocabulary the last index equals len(word_index) and
        # would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    """Build an embedding matrix for ``word_index`` from the Paragram SL999 300d vectors.

    Reads the module-level ``max_features`` to cap the number of rows and
    prints the mean / std used for the random initialisation.

    Args:
        word_index: mapping ``word -> integer index`` (e.g. ``Tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(nb_words, embed_size)`` where
        ``nb_words = min(max_features, len(word_index))``. Rows without a
        pretrained vector keep their random normal initialisation.
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager closes the handle. Undecodable bytes are dropped and
    # lines of 100 characters or fewer are skipped, as before.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o)>100)

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    # Hard-coded statistics (presumably precomputed for this file - TODO confirm).
    emb_mean,emb_std = -0.0053247833,0.49346462
    embed_size = all_embs.shape[1]
    print(emb_mean,emb_std,"para")

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Bound by the matrix height, not max_features: indices start at 1, so
        # with a small vocabulary the last index equals len(word_index) and
        # would otherwise fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

## Scaled Dot-product attention

In [28]:
class DotProdSelfAttention(Layer):
    """Single-head scaled dot-product self-attention as in 'Attention is all you need'.
    paper reference: https://arxiv.org/abs/1706.03762

    The input is projected with three separate ``(input_dim, units)`` kernels
    into Q, K and V, and the layer returns ``softmax(Q K^T / sqrt(units)) V``.

    Args:
        units: output dimensionality of the Q, K and V projections.
        activation: stored for configuration purposes; not applied in ``call``.
        use_bias: must stay ``False``; ``build`` raises ``NotImplementedError`` otherwise.
        kernel_initializer, kernel_regularizer, kernel_constraint: applied to
            the three projection kernels.
        bias_initializer, bias_regularizer, bias_constraint, activity_regularizer:
            stored on the layer; biases are not implemented.
        **kwargs: forwarded to the base ``Layer`` constructor (``input_dim`` is
            translated to ``input_shape``).
    """
    def __init__(self, units,
                 activation=None,
                 use_bias=False,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        # Forward as keyword arguments. Unpacking with a single '*' passed the
        # dict KEYS positionally, so e.g. name='x' was silently mis-assigned.
        super(DotProdSelfAttention, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def get_config(self):
        """Return a serialisable config whose keys match the constructor arguments.

        Objects (activation, initializers, regularizers, constraints) are
        serialised rather than returned raw, and only constructor arguments are
        included, so ``from_config`` / ``load_model`` can rebuild the layer.
        """
        config = super(DotProdSelfAttention, self).get_config().copy()
        config.update({
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'activity_regularizer': regularizers.serialize(self.activity_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint),
        })
        return config

    def build(self, input_shape):
        """Create the Q, K and V projection kernels for a rank-3 input."""
        assert len(input_shape) == 3
        input_dim = input_shape[-1]
        # We assume the output-dim of Q, K, V are the same
        self.kernels = {
            key: self.add_weight(shape=(input_dim, self.units),
                                 initializer=self.kernel_initializer,
                                 name='kernel_{}'.format(key),
                                 regularizer=self.kernel_regularizer,
                                 constraint=self.kernel_constraint)
            for key in ('Q', 'K', 'V')
        }
        if self.use_bias:
            raise NotImplementedError
        super(DotProdSelfAttention, self).build(input_shape)

    def call(self, x):
        """Return the attention-weighted values for input ``x``."""
        Q = K.dot(x, self.kernels['Q'])
        K_mat = K.dot(x, self.kernels['K'])
        V = K.dot(x, self.kernels['V'])
        # Pairwise query/key scores: Q @ K^T over the last two axes.
        attention = K.batch_dot(Q, K.permute_dimensions(K_mat, [0, 2, 1]))
        # Scale by sqrt(d_k) before the softmax, as in the paper.
        d_k = K.constant(self.units, dtype=K.floatx())
        attention = attention / K.sqrt(d_k)
        attention = K.batch_dot(K.softmax(attention, axis=-1), V)
        return attention

    def compute_output_shape(self, input_shape):
        """Same as the input shape with the last axis replaced by ``units``."""
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)
      
        
    

## The Encoder Block

In [29]:
def encoder(input_tensor):
    """Apply one encoder block (Attention Is All You Need) to ``input_tensor``.

    The block is multi-head self-attention followed by a two-layer
    feed-forward network, each wrapped in a residual connection. No
    normalisation is applied after either residual (it was left disabled).
    Uses the module-level ``embed_size`` and ``n_heads``.
    """
    # --- Sub-layer 1: multi-head attention --------------------------------
    head_dim = embed_size // n_heads
    heads = [DotProdSelfAttention(head_dim)(input_tensor) for _ in range(n_heads)]
    attended = concatenate(heads, axis=-1)
    attended = Dense(embed_size)(attended)
    attended = Dropout(0.1)(attended)

    # Residual connection around the attention sub-layer.
    attn_residual = add([input_tensor, attended])

    # --- Sub-layer 2: position-wise feed-forward (64 hidden units) --------
    hidden = Dense(64, activation='relu')(attn_residual)
    projected = Dense(embed_size)(hidden)

    # Residual connection around the feed-forward sub-layer.
    return add([attn_residual, projected])

## Positional Encoding

In [30]:
# https://github.com/kpot/keras-transformer/blob/master/keras_transformer/position.py
def positional_signal(hidden_size: int, length: int,
                      min_timescale: float = 1.0, max_timescale: float = 1e4):
    """
    Helper function, constructing basic positional encoding.
    The code is partially based on implementation from Tensor2Tensor library
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py

    Args:
        hidden_size: size of the last (feature) axis; must be even.
        length: number of positions to encode.
        min_timescale: smallest sinusoid timescale.
        max_timescale: largest sinusoid timescale.

    Returns:
        Backend tensor of shape ``(1, length, hidden_size)``: the first half of
        the feature axis holds the sines, the second half the cosines.

    Raises:
        ValueError: if ``hidden_size`` is odd.
    """

    if hidden_size % 2 != 0:
        raise ValueError(
            f"The hidden dimension of the model must be divisible by 2."
            f"Currently it is {hidden_size}")
    position = K.arange(0, length, dtype=K.floatx())
    num_timescales = hidden_size // 2
    # Clamp the denominator to 1: with hidden_size == 2 there is a single
    # timescale and dividing by (num_timescales - 1) == 0 produced inf, which
    # turned the whole signal into NaN.
    log_timescale_increment = K.constant(
        (np.log(float(max_timescale) / float(min_timescale)) /
         max(num_timescales - 1, 1)),
        dtype=K.floatx())
    # Geometric progression of inverse timescales from 1/min to 1/max.
    inv_timescales = (
            min_timescale *
            K.exp(K.arange(num_timescales, dtype=K.floatx()) *
                  -log_timescale_increment))
    # Outer product: (length, 1) * (1, num_timescales).
    scaled_time = K.expand_dims(position, 1) * K.expand_dims(inv_timescales, 0)
    signal = K.concatenate([K.sin(scaled_time), K.cos(scaled_time)], axis=1)
    # Leading axis of size 1 so the signal broadcasts over the batch.
    return K.expand_dims(signal, axis=0)

In [31]:
# https://github.com/kpot/keras-transformer/blob/master/keras_transformer/position.py
class AddPositionalEncoding(Layer):
    """
    Adds the sinusoidal positional signal from section 3.5 of
    "Attention is all you need" to its input. Also serves as a base class for
    the richer coordinate encoding described in "Universal Transformers".
    """

    def __init__(self, min_timescale: float = 1.0,
                 max_timescale: float = 1.0e4, **kwargs):
        self.min_timescale = min_timescale
        self.max_timescale = max_timescale
        # Populated in build(), once the input shape is known.
        self.signal = None
        super().__init__(**kwargs)

    def get_config(self):
        """Base layer config extended with the two timescale bounds."""
        config = super().get_config()
        config.update(min_timescale=self.min_timescale,
                      max_timescale=self.max_timescale)
        return config

    def build(self, input_shape):
        """Precompute the positional signal for a (batch, length, hidden) input."""
        # Drop the batch axis; exactly two axes must remain.
        length, hidden_size = input_shape[1:]
        self.signal = positional_signal(
            hidden_size, length, self.min_timescale, self.max_timescale)
        return super().build(input_shape)

    def call(self, inputs, **kwargs):
        """Return ``inputs`` with the precomputed signal added (broadcast over the batch)."""
        return inputs + self.signal


## Transformer Encoder model

In [32]:
def model_transformer( n_encoder=3, n_classes=20):
    """Build and compile the Transformer-encoder text classifier.

    Architecture: trainable embedding -> positional encoding -> dropout ->
    ``n_encoder`` encoder blocks -> concatenated global average / max pooling
    -> softmax. Uses the module-level ``maxlen``, ``max_features`` and
    ``embed_size``.

    Args:
        n_encoder: number of stacked encoder blocks.
        n_classes: number of output classes (width of the softmax layer).
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy, Adam, accuracy).
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, trainable=True)(inp)
    # Add positional encoding
    x = AddPositionalEncoding()(x)
    x = Dropout(0.1)(x)
    for _ in range(n_encoder):
        x = encoder(x)
    # These are my own experiments: pool over the sequence axis in two ways
    # and let the classifier see both summaries.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(n_classes, activation="softmax")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

In [33]:
LSTM

tensorflow.python.keras.layers.recurrent_v2.LSTM

In [34]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class BaseDataGenerator(Sequence):
    """Keras ``Sequence`` serving mini-batches of the global ``train_X`` / ``train_y``."""

    def __init__(self, list_IDs, batch_size=64, shuffle=True):
        self.list_IDs = list_IDs
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Build the initial (optionally shuffled) sample order.
        self.on_epoch_end()

    def __len__(self):
        """Number of steps in one epoch."""
        # Here is the trick: an "epoch" covers only a quarter of the batches.
        samples_per_step = self.batch_size * 2**2
        return len(self.list_IDs) // samples_per_step

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        start = index * self.batch_size
        stop = start + self.batch_size
        batch_ids = [self.list_IDs[pos] for pos in self.indexes[start:stop]]
        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        order = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(order)
        self.indexes = order

    def __data_generation(self, list_IDs_temp):
        """Slice the requested rows out of the global training arrays."""
        return train_X[list_IDs_temp, :], train_y[list_IDs_temp]

### Train and Predict

Here I use early stopping together with a model checkpoint that saves the model with the best validation accuracy.

In [35]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def train_pred(n_encoder = 1, epochs=2):
    """Train one Transformer-encoder classifier per stratified fold and predict the test set.

    Relies on module-level names: ``train_df``, ``test_df``, ``cols_target``,
    ``tokenizer``, ``maxlen`` and ``file_path``.

    Every fold writes its checkpoint to the same ``file_path`` and the
    checkpoint is never reloaded here, so the returned models carry the
    weights of their last trained epoch.

    Args:
        n_encoder: number of encoder blocks in each model.
        epochs: maximum number of training epochs per fold.

    Returns:
        Tuple ``(models, preds, scores)``: the fitted model, the test-set
        prediction array and the ``model.evaluate`` result on the validation
        split, one entry per fold.
    """
    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn releases reject the combination. The splits are unchanged.
    skf = StratifiedKFold(n_splits=7)
    models, preds, scores = [], [], []

    # The test set does not depend on the fold, so encode and pad it once.
    X_test = tokenizer.texts_to_sequences(test_df.Text)
    X_test = pad_sequences(X_test, maxlen=maxlen,padding = 'post', truncating = 'post')

    for train, test in skf.split(train_df.Text, train_df.Label):
        model = model_transformer(n_encoder=n_encoder)
        # split() yields positional indices, so select rows with iloc.
        X_train = train_df.Text.iloc[train]
        X_val = train_df.Text.iloc[test]
        y_train = train_df[cols_target].iloc[train]
        y_val = train_df[cols_target].iloc[test]
        X_train = tokenizer.texts_to_sequences(X_train)
        X_val = tokenizer.texts_to_sequences(X_val)

        ## Pad the sentences
        X_train = pad_sequences(X_train, maxlen=maxlen,padding = 'post', truncating = 'post')
        X_val = pad_sequences(X_val, maxlen=maxlen,padding = 'post', truncating = 'post')

        model.fit(X_train, y_train,
                    batch_size=32,
                    epochs=epochs,
                    verbose=1,
                   validation_data = (X_val,y_val),
                   callbacks=[
              ModelCheckpoint(file_path,    monitor='val_accuracy', mode='max', save_best_only=True),
              EarlyStopping(patience=10,    monitor="val_accuracy", mode="max"),
              ReduceLROnPlateau(patience=4, monitor='val_accuracy', mode='max', cooldown=2, min_lr=1e-7, factor=0.3)])
        preds.append(model.predict(X_test))
        models.append(model)
        scores.append(model.evaluate(X_val,y_val))
    return models, preds, scores

### Main part: load, train, pred and blend 

In [36]:
# train_X, val_X, test_X, train_y, val_y, word_index,val_idx,trn_idx = load_and_prec()
# Words the tokenizer actually keeps. Tokenizer(num_words=max_features) only
# emits indices strictly below max_features, so the word whose index equals
# max_features must not be included (the previous loop appended it before
# breaking).
vocab = [w for w, k in word_index.items() if k < max_features]
# embedding_matrix_1 = load_glove(word_index)
# embedding_matrix_2 = load_fasttext(word_index)
# embedding_matrix_3 = load_para(word_index)

### Create New Embeddings

In [37]:
## Simple average: http://aclweb.org/anthology/N18-2031

# We have presented an argument for averaging as
# a valid meta-embedding technique, and found experimental
# performance to be close to, or in some cases 
# better than that of concatenation, with the
# additional benefit of reduced dimensionality  


## Unweighted DME in https://arxiv.org/pdf/1804.07983.pdf

# “The downside of concatenating embeddings and 
#  giving that as input to an RNN encoder, however,
#  is that the network then quickly becomes inefficient
#  as we combine more and more embeddings.”
  
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2, embedding_matrix_3], axis = 0)
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_3], axis = 0)
# np.shape(embedding_matrix)
# model.evaluate(val_X,val_y)

## Train and Predict

Here I am experimenting with a single encoder block (`n_encoder = 1`); this is not guaranteed to be optimal, so you can try other numbers. Note that I train for up to 42 epochs, with early stopping.

In [38]:
# Container for per-model outputs (only referenced by a disabled line below).
outputs = []
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score,train_test_split,StratifiedShuffleSplit
# Checkpoint file written by the ModelCheckpoint callback in train_pred.
file_path = "weights_base.best.hdf5"
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, Callback, ReduceLROnPlateau

# outputs[0][1]
# type(train_X[0][0])
# val_X
# train_y

In [39]:
# model.evaluate(train_X,train_y)
# train_X.shape
# train_idx

In [40]:
# Train one single-encoder model per fold, for at most 42 epochs each
# (early stopping inside train_pred may end a fold sooner).
n_encoder = 1
models, preds, scores = train_pred(n_encoder = n_encoder,epochs = 42)
# outputs.append([pred_val_y, pred_test_y, 'transformer_enc{}'.format(n_encoder)])



Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 26/42
Epoch 27/42
Epoch 28/42
Epoch 29/42
Epoch 30/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42


Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 26/42
Epoch 27/42
Epoch 28/42
Epoch 29/42
Epoch 30/42
Epoch 31/42
Epoch 32/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42


Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 26/42
Epoch 27/42
Epoch 28/42
Epoch 29/42
Epoch 30/42
Epoch 31/42
Epoch 32/42
Epoch 33/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42


Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42


In [41]:
# for thresh in np.arange(0.1, 0.51, 0.01):
#     thresh = np.round(thresh, 2)
#     print("F1 score at threshold {0:.2f} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))
# Keep the fold models under a separate name: `models` is rebound to
# `keras.models` in a later cell.
models_trans = models
# scores
# preds
# np.mean(preds,axis = 0)


In [36]:
# pred_test_y = (pred_test_y > 0.42).astype(int)
# test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
# out_df = pd.DataFrame({"qid":test_df["qid"].values})
# out_df['prediction'] = pred_test_y
# out_df.to_csv("submission.csv", index=False)

In [37]:
# idx = (pred_test_y > 0.42).astype(int)
# test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
# out_df = pd.DataFrame({"qid":test_df["qid"].values})
# out_df['prediction'] = idx

In [38]:
# mylist = out_df[out_df.prediction == 1].index
# for i in mylist:
#     print(i, end=',')

# Stacking

# Stacking

In [42]:
# dir(model)
# Reload the raw CSVs for the TF-IDF pipeline; this discards the one-hot
# label columns that were appended to train_df earlier.
train_df = pd.read_csv("../Translated/cleaned/train.csv")
test_df = pd.read_csv("../Translated/cleaned/test.csv")
import re
def clean_text(text):
    """Normalise a text string before vectorisation.

    Lower-cases the text, replaces the punctuation marks ``, ! ( ) ?`` with
    spaces, collapses every run of whitespace (including tabs and newlines)
    into a single space and strips leading / trailing spaces. All other
    characters, apostrophes included, are kept.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # One character-class pass instead of five separate substitutions.
    text = re.sub(r"[,!()?]", " ", text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence. A
    # single \s+ pass also covers the former separate \s{2,} pass.
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

# removing stop words
# other_stop_w = pd.read_csv('../Downloaded_notebooks/words_shared_by_all.csv')
# stopw = [item for sublist in other_stop_w.values.tolist() for item in sublist]
# train_df['Text'].apply(lambda x: [item for item in x.split() if item not in stopw])
# test_df['Text'].apply(lambda x: [item for item in x.split() if item not in stopw])

# Normalise the text column of both frames with clean_text.
train_df['Text'] = train_df['Text'].map(clean_text)
test_df['Text'] = test_df['Text'].map(clean_text)
# Cleaned text series that feed the TF-IDF vectoriser.
X_tfidf = train_df['Text']
test_X_tfidf = test_df['Text']




In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features capped at 45000 terms, with sublinear tf scaling; terms that
# occur in more than half of the documents are dropped.
# NOTE(review): stop_words='english' is used although the sample rows printed
# in this notebook do not look like English text - confirm this is intended.
vect = TfidfVectorizer(max_features=45000,sublinear_tf=True, max_df=0.5, stop_words='english')

# Fit the vocabulary on the training text and convert the sparse result to a dense array.
X_dtm = vect.fit_transform(X_tfidf).toarray()

# Reuse the fitted vocabulary for the test text (no refitting).
test_X_dtm = vect.transform(test_X_tfidf).toarray()

In [44]:
# train_df

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,mwangonde: khansala wachinyamata akamati achin...,POLITICS
1,ID_AGoFySzn,mcp siidakhutire ndi kalembera chipani cha mal...,POLITICS
2,ID_AGrrkBGP,bungwe la manepo lapempha boma liganizire anth...,HEALTH
3,ID_AIJeigeG,ndale zogawanitsa miyambo zanyanya si zachilen...,POLITICS
4,ID_APMprMbV,nanga wapolisi ataphofomoka masiku ano sichikh...,LAW/ORDER
...,...,...,...
1431,ID_zmTmmEio,eni minibus ati ali ndi ufulu wokweza mitengo ...,TRANSPORT
1432,ID_znOlIaGQ,kachali apepesa: kulankhula motumbwa kuthe ant...,POLITICS
1433,ID_znracTjN,mawu supports non-fiction writers the malawi w...,EDUCATION
1434,ID_ztdsmmva,tame mwawa: phwete ndiye kudya kwake sewero la...,SOCIAL ISSUES


In [45]:
# Rebuild the one-hot label columns on the freshly reloaded train_df.
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_train = lb.fit_transform(train_df['Label'])

# Indicator columns are named after the classes found by the binarizer.
y_train = pd.DataFrame(y_train, columns= lb.classes_)
# # y_train
# Class names in order of first appearance; used to select the target columns.
cols_target = train_df['Label'].unique().tolist()
# Append the indicator columns to the training frame (concat aligns on the row index).
train_df = pd.concat([train_df, y_train], axis = 1)
# # train_df

# x_train, x_val, y_train, y_val = train_test_split(X_dtm, train_df[cols_target], test_size=0.1, random_state = 0,stratify = train_df['Label'])

In [46]:
# train_df

In [47]:
# train_idx = list(set(X_tfidf.index.tolist()) - set(val_idx.tolist()))
# base_model.evaluate(X_dtm[val_idx],train_df.loc[val_idx,cols_target])
# (y_val == val_y).all()

In [48]:
# train_y.shape
# len(train_idx)

In [49]:
from tensorflow import keras
# NOTE: these two assignments rebind module-level names. `layers` was imported
# from standalone keras at the top of the notebook, and `models` previously
# held the list of fold models returned by train_pred.
layers = keras.layers
models = keras.models
# Build the model
from keras import backend as K

# Do some code, e.g. train and save model

# K.clear_session()
# seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
# os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
# random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
# np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
# tf.random.set_seed(seed_value)
def build_base_model(input_dim=45000, n_classes=20):
    """Build and compile the dense baseline classifier over TF-IDF features.

    Clears the Keras session and re-seeds TensorFlow, ``random`` and NumPy
    with 0 before building, then stacks
    Dense(1000) -> linear activation -> Dropout(0.2) -> Dense(n_classes) -> softmax.

    Args:
        input_dim: number of input features. Defaults to 45000, the value that
            used to be hard-coded (the ``max_features`` of the TF-IDF vectoriser).
        n_classes: number of output classes. Defaults to 20, the value that
            used to be hard-coded.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    K.clear_session()
    seed_value = 0
    tf.random.set_seed(seed_value)
    # NOTE: the running interpreter's hash seed is fixed at start-up, so this
    # assignment only affects processes spawned afterwards.
    os.environ['PYTHONHASHSEED']=str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    base_model = models.Sequential()
    base_model.add(layers.Dense(1000, input_shape=(input_dim,)))
    # 'linear' is the identity, so the hidden layer has no non-linearity.
    base_model.add(layers.Activation('linear'))
    base_model.add(layers.Dropout(0.2))
    base_model.add(layers.Dense(n_classes))
    base_model.add(layers.Activation('softmax'))

    base_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return base_model

In [50]:
# X_dtm[train_idx].shape
# base_model
# history = base_model.fit(x_train, y_train,
#                     batch_size=64,
#                     epochs=10,
#                     verbose=1,
#                    validation_split = 0.1)
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, Callback, ReduceLROnPlateau
file_path = "weights_base.best.hdf5"
def benchmark():
    """Cross-validate the dense TF-IDF baseline and average its test predictions.

    Relies on module-level names: ``train_df``, ``test_df``, ``cols_target``,
    ``vect`` (a fitted TF-IDF vectoriser), ``file_path`` and ``build_base_model``.

    NOTE(review): every fold's model and dense feature matrices stay
    referenced until the function returns; the recorded notebook run ended
    with an out-of-memory error, so memory use is worth revisiting.

    Returns:
        Tuple ``(models, pred, scores)``: the fitted model per fold, the mean
        of the per-fold test-set predictions, and the ``evaluate`` result on
        each validation split.
    """
    print('_' * 80)
    print("Training: ")
    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn releases reject the combination. The splits are unchanged.
    skf = StratifiedKFold(n_splits=5)
    models, preds, scores = [], [], []

    # The test features do not depend on the fold, so vectorise them once.
    X_test = vect.transform(test_df.Text).toarray()

    for train, test in skf.split(train_df.Text, train_df.Label):
        clf = build_base_model()
        # split() yields positional indices, so select rows with iloc.
        X_train = train_df.Text.iloc[train]
        X_val = train_df.Text.iloc[test]
        y_train = train_df[cols_target].iloc[train]
        y_val = train_df[cols_target].iloc[test]
        X_train = vect.transform(X_train).toarray()
        X_val = vect.transform(X_val).toarray()
        # With epochs=10 and patience=10 the EarlyStopping callback cannot
        # fire; it is kept so that raising `epochs` needs no other change.
        clf.fit(X_train, y_train,
                    batch_size=32,
                    epochs=10,
                    verbose=1,
                   validation_data = (X_val,y_val),
                   callbacks=[
              ModelCheckpoint(file_path,    monitor='val_accuracy', mode='max', save_best_only=True),
              EarlyStopping(patience=10,    monitor="val_accuracy", mode="max"),
              ReduceLROnPlateau(patience=4, monitor='val_accuracy', mode='max', cooldown=2, min_lr=1e-7, factor=0.3)])
        preds.append(clf.predict(X_test))
        models.append(clf)
        scores.append(clf.evaluate(X_val,y_val))
    # Blend the folds by averaging their class probabilities.
    pred = np.mean(preds,axis = 0)
    return models, pred,scores


In [51]:
# base_model.evaluate(x_val,y_val)
# train_X.shape
# (y_train == train_y).all()
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score,train_test_split,StratifiedShuffleSplit

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron,LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn.decomposition import PCA
from sklearn import metrics
import pandas as pd
from sklearn.metrics import accuracy_score
# Run the cross-validated benchmark(); it returns the list of per-fold models,
# the fold-averaged test predictions, and the per-fold evaluate() results.
base_models, pred,scores = benchmark()

________________________________________________________________________________
Training: 




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ResourceExhaustedError: OOM when allocating tensor with shape[45000,1000] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:RandomUniform]

In [61]:
# scores

[[1.335684061050415, 0.6145833134651184],
 [1.2624201774597168, 0.6376306414604187],
 [1.1637887954711914, 0.6689895391464233],
 [1.1812516450881958, 0.6236934065818787],
 [1.3277735710144043, 0.6097561120986938]]

In [58]:
# base_model.trainable = False
# model.trainable = False
# Display the list of fitted models returned by benchmark() (one entry is
# appended per trained fold).
base_models

[<tensorflow.python.keras.engine.sequential.Sequential at 0x1dd41b4b608>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd41d2e608>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd423f9388>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd441a9508>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd479cafc8>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd49cc5148>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x1dd39ecad48>]

In [93]:
# x_train.shape
# X_dtm
# lb.inverse_transform(pred)
# lb.inverse_transform(pd.DataFrame(pred,columns = cols_target)[lb.classes_].values)
# Average the collected predictions over the first axis.
# NOTE(review): `preds` is local to benchmark(); this cell relies on a global
# `preds` assigned elsewhere in the notebook — confirm it is the per-fold list.
pred = np.mean(preds,axis = 0)

# Label the probability columns with cols_target, reorder them to lb.classes_,
# then decode to class names (assumes lb is a fitted label binarizer whose
# inverse_transform expects columns in classes_ order — TODO confirm).
test_df['Label']= lb.inverse_transform(pd.DataFrame(pred,columns = cols_target)[lb.classes_].values)
# Keep only the ID and predicted Label columns and write them without the index.
sub = test_df[['ID', 'Label']]
sub.to_csv('cross_enc_001.csv', index = False)
# Bare expression so the notebook renders the submission frame.
sub

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,SOCIAL ISSUES
3,ID_AUKYBbIM,SOCIAL ISSUES
4,ID_AZnsVPEi,SOCIAL ISSUES
...,...,...
615,ID_zdpOUWyJ,SOCIAL
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,SOCIAL ISSUES


In [None]:
# train_df = pd.read_csv("../Translated/cleaned/train.csv")
# test_df = pd.read_csv("../Translated/cleaned/test.csv")

# Reset the Keras backend state, then fix every random seed to the same value
# so that model construction below starts from a known RNG state.
K.clear_session()
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
# NOTE(review): this variable is normally read at interpreter start-up, so
# setting it here likely only affects child processes — confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed TensorFlow's global pseudo-random generator.
import tensorflow as tf
tf.random.set_seed(seed_value)
# Needed by build_supermodel() for the L2 kernel regularizer.
from tensorflow.keras import regularizers

def build_supermodel():
    """Build and compile the stacking model that sits on top of the sub-models.

    The model takes two inputs, in this order: a length-``maxlen`` sequence
    input fed to every model in ``models_trans``, and a 45000-wide feature
    input fed to every model in ``base_models``. All sub-model outputs are
    concatenated and passed through a small dense head ending in a 20-way
    softmax.

    Side effects: clears the Keras session, re-seeds the Python, NumPy and
    TensorFlow RNGs with ``seed_value``, and renames each entry of
    ``base_models`` to ``base_model_<i>``.

    Returns:
        The compiled ``keras.Model`` (Adam, categorical cross-entropy,
        accuracy metric).
    """
    # Start from a clean backend state and a fixed RNG state on every build.
    K.clear_session()
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    seq_input = layers.Input(shape=(maxlen,))
    feat_input = layers.Input(shape=(45000,))

    # Give every base model a distinct name before wiring it into the graph.
    for idx, member in enumerate(base_models):
        member._name = 'base_model_' + str(idx)

    # Sub-models are called in inference mode (training=False).
    feat_outputs = [member(feat_input, training=False) for member in base_models]
    seq_outputs = [member(seq_input, training=False) for member in models_trans]

    # Merge each family of outputs, then join the two families.
    feat_merged = layers.Concatenate(name='output_1')(feat_outputs)
    seq_merged = layers.Concatenate()(seq_outputs)
    hidden = layers.Concatenate()([seq_merged, feat_merged])

    # Dense head: 512 (linear) -> 256 (sigmoid) -> 128 (linear, L2) -> softmax.
    hidden = layers.Dense(512, activation='linear')(hidden)
    hidden = layers.Dense(256, activation='sigmoid')(hidden)
    hidden = layers.Dense(
        128,
        activation='linear',
        kernel_regularizer=regularizers.l2(0.05),
    )(hidden)
    outputs = layers.Dense(20, activation="softmax")(hidden)

    super_model = keras.Model(inputs=[seq_input, feat_input], outputs=outputs)
    super_model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
    return super_model
# history = super_model.fit(
#     [train_X,x_train], y_train, batch_size=32, epochs=19, validation_split = 0.1
# )

In [None]:
def benchmark_stack():
    """Train and evaluate the stacked super-model with 5-fold stratified CV.

    For every fold a fresh model is obtained from ``build_supermodel()`` and
    fitted on two parallel inputs built from ``train_df.Text``: padded token
    sequences (``tokenizer`` + ``pad_sequences``) and dense TF-IDF features
    (``vect``). The fitted model then predicts the test set and is evaluated
    on the fold's validation rows.

    Returns:
        tuple ``(models, pred, scores)`` where ``models`` is the list of
        fitted per-fold models, ``pred`` is the element-wise mean of the
        per-fold test-set predictions, and ``scores`` is the list of
        ``clf.evaluate`` results on each validation split.
    """
    print('_' * 80)
    print("Training: ")

    # No shuffling is requested, so random_state must not be passed: it has no
    # effect on the folds and recent scikit-learn releases reject the
    # combination with a ValueError.
    skf = StratifiedKFold(n_splits=5)
    models, preds, scores = [], [], []

    # Test-set features do not depend on the fold, so build them only once.
    X_test_ker = pad_sequences(tokenizer.texts_to_sequences(test_df.Text),
                               maxlen=maxlen, padding='post', truncating='post')
    X_test_tfidf = vect.transform(test_df.Text).toarray()

    for train, test in skf.split(train_df.Text, train_df.Label):
        clf = build_supermodel()

        # skf.split yields positional indices, so select rows by position;
        # label-based lookup would silently drop rows whenever train_df does
        # not carry a default 0..n-1 index.
        X_train = train_df.Text.iloc[train]
        X_val = train_df.Text.iloc[test]
        y_train = train_df[cols_target].iloc[train]
        y_val = train_df[cols_target].iloc[test]

        # Sequence input: tokenize, then pad/truncate at the end to maxlen.
        X_train_ker = pad_sequences(tokenizer.texts_to_sequences(X_train),
                                    maxlen=maxlen, padding='post', truncating='post')
        X_val_ker = pad_sequences(tokenizer.texts_to_sequences(X_val),
                                  maxlen=maxlen, padding='post', truncating='post')

        # TF-IDF input, densified for Keras.
        X_train_tfidf = vect.transform(X_train).toarray()
        X_val_tfidf = vect.transform(X_val).toarray()

        # NOTE(review): every fold checkpoints to the same file_path and the
        # checkpoint is never reloaded here, so predict/evaluate below use the
        # in-memory weights from the last epoch. With epochs=3 the patience
        # values of EarlyStopping (10) and ReduceLROnPlateau (4) are never
        # reached.
        clf.fit([X_train_ker, X_train_tfidf], y_train,
                batch_size=32,
                epochs=3,
                verbose=1,
                validation_data=([X_val_ker, X_val_tfidf], y_val),
                callbacks=[
                    ModelCheckpoint(file_path, monitor='val_accuracy', mode='max', save_best_only=True),
                    EarlyStopping(patience=10, monitor="val_accuracy", mode="max"),
                    ReduceLROnPlateau(patience=4, monitor='val_accuracy', mode='max',
                                      cooldown=2, min_lr=1e-7, factor=0.3),
                ])

        preds.append(clf.predict([X_test_ker, X_test_tfidf]))
        models.append(clf)
        scores.append(clf.evaluate([X_val_ker, X_val_tfidf], y_val))

    # Ensemble the folds by averaging their test-set predictions.
    pred = np.mean(preds, axis=0)
    return models, pred, scores


In [None]:
# Train the stacked model over all folds; returns the per-fold models, the
# fold-averaged test predictions, and the per-fold validation scores.
models, pred,scores = benchmark_stack()

In [79]:
# super_model.evaluate([val_X,x_val], y_val)




[1.8373671770095825, 0.6805555820465088]

In [80]:
# (y_val.values == val_y).all()
# y_val.columns == lb.classes_
# Predict the test set with the stacked model from its two inputs.
# NOTE(review): super_model, test_X and test_X_dtm are not assigned in the
# visible cells — confirm they match the [sequence, TF-IDF] input order used
# when the model was built.
preds  = super_model.predict([test_X, test_X_dtm])
# Label the output columns with cols_target, reorder them to lb.classes_, then
# decode to class names (assumes lb is a fitted label binarizer — TODO confirm).
test_df['Label'] = lb.inverse_transform(pd.DataFrame(preds,columns = cols_target)[lb.classes_].values)

In [81]:
# test_X.shape
# Build the submission frame (ID + predicted Label) and save it as CSV without
# the index column.
sub = test_df[['ID','Label']]
sub.to_csv('submission_keras_stack003.csv', index = False)

In [86]:
# Count how many predicted labels match the previous submission file.
# The comparison is element-wise, so both files are presumed to list the test
# rows in the same order — verify before relying on the number.
(sub.Label == pd.read_csv('submission_keras_stack002.csv').Label).sum()

612

In [None]:
from sklearn.linear
from sklearn.model_selection import StratifiedKFold
