In [1]:
import gc
import os
import nltk
import tqdm
import numpy as np
import pandas as pd
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def tokenize_sentences(sentences, words_dict):
    tokenized_sentences = []
    for sentence in tqdm.tqdm(sentences):
        if hasattr(sentence, "decode"):
            sentence = sentence.decode("utf-8")
        tokens = nltk.tokenize.word_tokenize(sentence)
        result = []
        for word in tokens:
            word = word.lower()
            if word not in words_dict:
                words_dict[word] = len(words_dict)
            word_index = words_dict[word]
            result.append(word_index)
        tokenized_sentences.append(result)
    return tokenized_sentences, words_dict

In [3]:
def read_embedding_list(file_path):
    embedding_word_dict = {}
    embedding_list = []
    f = open(file_path,"r",encoding="utf-8")

    for index, line in enumerate(f):
        if index == 0:
            continue
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            continue
        embedding_list.append(coefs)
        embedding_word_dict[word] = len(embedding_word_dict)
    f.close()
    embedding_list = np.array(embedding_list)
    return embedding_list, embedding_word_dict
def clear_embedding_list(embedding_list, embedding_word_dict, words_dict):
    cleared_embedding_list = []
    cleared_embedding_word_dict = {}

    for word in words_dict:
        if word not in embedding_word_dict:
            continue
        word_id = embedding_word_dict[word]
        row = embedding_list[word_id]
        cleared_embedding_list.append(row)
        cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict)

    return cleared_embedding_list, cleared_embedding_word_dict
def convert_tokens_to_ids(tokenized_sentences, words_list, embedding_word_dict, sentences_length):
    words_train = []

    for sentence in tokenized_sentences:
        current_words = []
        for word_index in sentence:
            word = words_list[word_index]
            word_id = embedding_word_dict.get(word, len(embedding_word_dict) - 2)
            current_words.append(word_id)

        if len(current_words) >= sentences_length:
            current_words = current_words[:sentences_length]
        else:
            current_words += [len(embedding_word_dict) - 1] * (sentences_length - len(current_words))
        words_train.append(current_words)
        print (words_train)
    return words_train

In [4]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.engine import Layer
from keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten
from keras.layers import concatenate, GRU, Input, K, LSTM, MaxPooling1D
from keras.layers import GlobalAveragePooling1D,  GlobalMaxPooling1D, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

Using TensorFlow backend.


In [5]:
gru_len = 16 ############### changed from 128
Routings = 5
Num_capsule = 10
Dim_capsule = 16
dropout_p = 0.3
rate_drop_dense = 0.3

In [6]:
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

In [7]:
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [8]:
def get_model(embedding_matrix, sequence_length, dropout_rate, recurrent_units, dense_size):
    input1 = Input(shape=(sequence_length,))
    embed_layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                                weights=[embedding_matrix], trainable=False)(input1)
    embed_layer = SpatialDropout1D(rate_drop_dense)(embed_layer)

    x = Bidirectional(
        GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))(
        embed_layer)
    capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings,
                      share_weights=True)(x)
    capsule = Flatten()(capsule)
    capsule = Dropout(dropout_p)(capsule)
    output = Dense(1, activation='sigmoid')(capsule)
    model = Model(inputs=input1, outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    return model

In [9]:
def _train_model(model, batch_size, train_x, train_y, val_x, val_y):
    num_labels = train_y.shape[1]
    patience = 5
    best_loss = -1
    best_weights = None
    best_epoch = 0
    
    current_epoch = 0
    
    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epochs=1)
        y_pred = model.predict(val_x, batch_size=batch_size)

        total_loss = 0
        for j in range(num_labels):
            loss = log_loss(val_y[:, j], y_pred[:, j])
            total_loss += loss

        total_loss /= num_labels

        print("Epoch {0} loss {1} best_loss {2}".format(current_epoch, total_loss, best_loss))

        current_epoch += 1
        if total_loss < best_loss or best_loss == -1:
            best_loss = total_loss
            best_weights = model.get_weights()
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == patience:
                break

    model.set_weights(best_weights)
    return model

In [10]:
def train_folds(X, y, X_test, fold_count, batch_size, get_model_func):
    print("="*75)
    fold_size = len(X) // fold_count
    models = []
    result_path = "binaries/predictions"
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = np.array(X[fold_start:fold_end])
        val_y = np.array(y[fold_start:fold_end])

        model = _train_model(get_model_func(), batch_size, train_x, train_y, val_x, val_y)
        train_predicts_path = os.path.join(result_path, "train_predicts{0}.npy".format(fold_id))
        test_predicts_path = os.path.join(result_path, "test_predicts{0}.npy".format(fold_id))
        train_predicts = model.predict(X, batch_size=512, verbose=1)
        test_predicts = model.predict(X_test, batch_size=512, verbose=1)
        np.save(train_predicts_path, train_predicts)
        np.save(test_predicts_path, test_predicts)

    return models

In [11]:
# train_file_path = "../input/donorschooseorg-preprocessed-data/train_preprocessed.csv"
train_file_path = "binaries/train_small.csv"

# test_file_path = "../input/donorschooseorg-preprocessed-data/test_preprocessed.csv"
test_file_path = "binaries/test_small.csv"

# embedding_path = "../input/fatsttext-common-crawl/crawl-300d-2M/crawl-300d-2M.vec"
embedding_path = "binaries/embeddings_small.vec"

batch_size = 128 # 256
recurrent_units = 16 # 64
dropout_rate = 0.3 
dense_size = 8 # 32
sentences_length = 10 # 300
fold_count = 2 # 10

In [12]:
UNKNOWN_WORD = "_UNK_"
END_WORD = "_END_"
NAN_WORD = "_NAN_"
CLASSES = ["project_is_approved"]
# Load data
print("Loading data...")
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
list_sentences_train = train_data["application_text"].fillna(NAN_WORD).values
list_sentences_test = test_data["application_text"].fillna(NAN_WORD).values
y_train = train_data[CLASSES].values
print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})
print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)
# Embedding
words_dict[UNKNOWN_WORD] = len(words_dict)
print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(embedding_path)
embedding_size = len(embedding_list[0])
print("Preparing data...")
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
embedding_list.append([0.] * embedding_size)
embedding_word_dict[END_WORD] = len(embedding_word_dict)
embedding_list.append([-1.] * embedding_size)

embedding_matrix = np.array(embedding_list)

id_to_word = dict((id, word) for word, id in words_dict.items())
train_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_train,
    id_to_word,
    embedding_word_dict,
    sentences_length)
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test,
    id_to_word,
    embedding_word_dict,
    sentences_length)
X_train = np.array(train_list_of_token_ids)
X_test = np.array(test_list_of_token_ids)
get_model_func = lambda: get_model(
    embedding_matrix,
    sentences_length,
    dropout_rate,
    recurrent_units,
    dense_size)

 16%|█▌        | 16/99 [00:00<00:00, 157.20it/s]

Loading data...
Tokenizing sentences in train set...


100%|██████████| 99/99 [00:00<00:00, 200.76it/s]
 17%|█▋        | 17/99 [00:00<00:00, 166.61it/s]

Tokenizing sentences in test set...


100%|██████████| 99/99 [00:00<00:00, 166.34it/s]


Loading embeddings...
Preparing data...
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 77, 16, 77, 77, 77, 77, 77, 70, 77]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 77, 16, 77, 77, 77, 77, 77, 70, 77], [77, 77, 77, 16, 77, 77, 4, 77, 77, 77]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 77, 16, 77, 77, 77, 77, 77, 70, 77], [77, 77, 77, 16, 77, 77, 4, 77, 77, 77], [3, 77, 77, 77, 0, 41, 77, 77, 77, 66]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 77, 16, 77, 77, 77, 77, 77, 70, 77], [77, 77, 77, 16, 77, 77, 4, 77, 77, 77], [3, 77, 77, 77, 0, 41, 77, 77, 77, 66], [77, 16, 77, 41, 77, 77, 66, 77, 77, 71]]
[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 7

[[77, 77, 77, 77, 77, 22, 66, 77, 77, 77], [77, 77, 4, 77, 35, 41, 77, 77, 70, 1], [77, 77, 16, 77, 77, 77, 77, 77, 70, 77], [77, 77, 77, 16, 77, 77, 4, 77, 77, 77], [3, 77, 77, 77, 0, 41, 77, 77, 77, 66], [77, 16, 77, 41, 77, 77, 66, 77, 77, 71], [77, 77, 77, 70, 31, 15, 77, 77, 66, 77], [77, 77, 77, 16, 77, 77, 77, 19, 77, 7], [77, 77, 77, 77, 77, 77, 66, 77, 77, 77], [77, 28, 63, 1, 77, 77, 77, 77, 77, 77], [77, 77, 77, 66, 77, 21, 77, 34, 2, 3], [77, 77, 66, 77, 77, 16, 77, 77, 63, 31], [77, 77, 77, 77, 66, 77, 70, 77, 34, 1], [77, 77, 77, 42, 58, 2, 3, 77, 16, 77], [77, 77, 31, 77, 48, 41, 77, 77, 18, 1], [1, 77, 77, 66, 77, 77, 19, 77, 16, 77], [77, 0, 77, 77, 66, 77, 19, 77, 77, 77], [77, 77, 77, 22, 31, 77, 77, 70, 68, 77], [77, 77, 77, 77, 41, 77, 77, 77, 19, 48], [77, 63, 77, 1, 77, 77, 77, 16, 77, 31], [1, 77, 77, 77, 77, 77, 51, 1, 77, 34], [77, 77, 77, 77, 31, 27, 22, 77, 77, 77], [77, 77, 77, 34, 31, 77, 77, 77, 77, 77], [77, 77, 63, 77, 77, 72, 44, 77, 77, 77], [77, 77, 

In [13]:
del train_data, test_data, list_sentences_train, list_sentences_test
del tokenized_sentences_train, tokenized_sentences_test, words_dict
del embedding_list, embedding_word_dict
del train_list_of_token_ids, test_list_of_token_ids
gc.collect();
print("Starting to train models...")
models = train_folds(X_train, y_train, X_test, fold_count, batch_size, get_model_func)

Starting to train models...
Epoch 1/1
Epoch 0 loss 0.6899410936297202 best_loss -1
Epoch 1/1
Epoch 1 loss 0.6066481975876555 best_loss 0.6899410936297202
Epoch 1/1
Epoch 2 loss 0.5414380607556324 best_loss 0.6066481975876555
Epoch 1/1
Epoch 3 loss 0.4933731543774508 best_loss 0.5414380607556324
Epoch 1/1
Epoch 4 loss 0.459547401082759 best_loss 0.4933731543774508
Epoch 1/1
Epoch 5 loss 0.437181222195528 best_loss 0.459547401082759
Epoch 1/1
Epoch 6 loss 0.42309822324587376 best_loss 0.437181222195528
Epoch 1/1
Epoch 7 loss 0.4151966565725755 best_loss 0.42309822324587376
Epoch 1/1
Epoch 8 loss 0.4112134277820587 best_loss 0.4151966565725755
Epoch 1/1
Epoch 9 loss 0.40984491213243834 best_loss 0.4112134277820587
Epoch 1/1
Epoch 10 loss 0.4100162263731567 best_loss 0.40984491213243834
Epoch 1/1
Epoch 11 loss 0.41101059758541536 best_loss 0.40984491213243834
Epoch 1/1
Epoch 12 loss 0.41247052775353804 best_loss 0.40984491213243834
Epoch 1/1
Epoch 13 loss 0.41382661537856474 best_loss 0.40

In [21]:
from scipy.stats import rankdata

LABELS = ["project_is_approved"]

base = "binaries/"
predict_list = []
for j in range(2):
    predict_list.append(np.load(base + "/predictions/test_predicts%d.npy"%j))
    
print("Rank averaging on ", len(predict_list), " files")
predictions = np.zeros_like(predict_list[0])
for predict in predict_list:
    predcitions = np.add(predcitions.flatten(), rankdata(predict)/predictions.shape[0]) 
    print(np.add(predictions.flatten(), rankdata(predict)/predcitions.shape[0]))
predictions /= len(predict_list)
submission = pd.read_csv('binaries/sample_submission.csv')

print(submission)
print(len(predict_list))

# submission[LABELS] = pd.Series(predictions) ##### Commented Out for getting labels

submission.to_csv('submission.csv', index=False)

Rank averaging on  2  files
[0.22222222 0.46464646 0.47474747 0.4040404  0.32323232 0.62626263
 0.50505051 0.90909091 0.56565657 0.03030303 0.82828283 0.67676768
 0.37373737 0.65656566 0.12121212 0.77777778 0.87878788 0.42424242
 0.45454545 0.17171717 0.04040404 0.1010101  0.34343434 0.58585859
 0.27272727 0.16161616 0.15151515 0.70707071 0.09090909 0.51515152
 0.28282828 0.25252525 0.06060606 0.2020202  0.68686869 0.33333333
 0.8989899  0.57575758 0.63636364 0.7979798  0.80808081 0.36363636
 0.24242424 0.05050505 0.43434343 0.29292929 0.86868687 0.21212121
 0.6969697  0.53535354 0.52525253 0.85858586 0.98989899 0.5959596
 0.93434343 0.55555556 0.81818182 0.23232323 0.08080808 0.78787879
 0.13131313 0.54545455 0.49494949 0.97979798 0.38383838 0.35353535
 0.60606061 0.02020202 0.01010101 0.07070707 1.         0.95959596
 0.44444444 0.96969697 0.31313131 0.83838384 0.3030303  0.48484848
 0.64646465 0.94949495 0.14141414 0.71717172 0.73737374 0.72727273
 0.11111111 0.88888889 0.75757576 0

Exception: Data must be 1-dimensional