In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
import re
import os
os.environ['OMP_NUM_THREADS'] = '4'
from tqdm import tqdm

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

from keras import regularizers
from keras.models import Model
from keras.layers import Dropout, Input, Dense, Embedding, SpatialDropout1D, concatenate, PReLU
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import Nadam, Adam
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, TensorBoard, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
import tensorflow as tf
from global_variables import TRAIN_FILENAME, TEST_FILENAME, SAMPLE_SUBMISSION_FILENAME
from preprocess_utils import preprocess
import warnings
warnings.filterwarnings('ignore')

max_features = 150000
maxlen = 200
em = 'ft'
embed_size = 0
if em == 'glove':
    embed_size = 200
else:
    embed_size = 300
#fix4->fix5:去掉concatenate后的Dropout层，SpatialDropout1D百分比0.45->0.5
#fix5->fix6:modelcheckpoint
#fix6->fix7:preprocess, gru后+prelu和dropout etc.
#fix7->fix8:LR decrese, glove twitter200d
#fix8->fix9:10fold
#fix9->fix11:dropout0.5->0.2, attention layer

def glove_preprocess(text):
    #adapted from https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
    # Different regex parts for smiley faces
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub("https?:* ", "<URL>", text)
    text = re.sub("www.* ", "<URL>", text)
    text = re.sub("\[\[User(.*)\|", '<USER>', text)
    text = re.sub("<3", '<HEART>', text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub(eyes + nose + "[Dd)]", '<SMILE>', text)
    text = re.sub("[(d]" + nose + eyes, '<SMILE>', text)
    text = re.sub(eyes + nose + "p", '<LOLFACE>', text)
    text = re.sub(eyes + nose + "\(", '<SADFACE>', text)
    text = re.sub("\)" + nose + eyes, '<SADFACE>', text)
    text = re.sub(eyes + nose + "[/|l*]", '<NEUTRALFACE>', text)
    text = re.sub("/", " / ", text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub("([!]){2,}", "! <REPEAT>", text)
    text = re.sub("([?]){2,}", "? <REPEAT>", text)
    text = re.sub("([.]){2,}", ". <REPEAT>", text)
    pattern = re.compile(r"(.)\1{2,}")
    text = pattern.sub(r"\1" + " <ELONG>", text)
    return text

def deduplicate(x, threshold):
    word_list = x.split()
    num_words = len(word_list)
    if num_words == 0:
        return x
    else:
        num_unique_words = len(set(word_list))
        unique_ratio = num_words/num_unique_words
        if unique_ratio > threshold:
            x = ' '.join(x.split()[:num_unique_words])
        return x

train = pd.read_csv(TRAIN_FILENAME)
test = pd.read_csv(TEST_FILENAME)
submission = pd.read_csv(SAMPLE_SUBMISSION_FILENAME)

if em == 'glove':
    train['comment_text'] = train['comment_text'].apply(lambda x: glove_preprocess(x))
    test['comment_text'] = test['comment_text'].apply(lambda x: glove_preprocess(x))

train = preprocess(train)
train = preprocess(train)

X = train["comment_text"].fillna(" ").values
Y = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna(" ").values

tokenizer_w = text.Tokenizer(num_words=max_features)
tokenizer_w.fit_on_texts(texts=list(X) + list(X_test))

X = tokenizer_w.texts_to_sequences(X)
X_test = tokenizer_w.texts_to_sequences(X_test)

X = sequence.pad_sequences(X, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

EMBEDDING_FILE = ''
# EMBEDDING_FILE = '../embed/crawl-300d-2M.vec'
if em == 'glove':
    EMBEDDING_FILE = 'assets/embedding_models/glove/glove.twitter.27B.200d.txt'
else:
    EMBEDDING_FILE = 'assets/embedding_models/ft_300d_crawl/crawl-300d-2M.vec'
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in tqdm(open(EMBEDDING_FILE, encoding='UTF-8')))

word_index = tokenizer_w.word_index 
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, idx in tqdm(word_index.items()):
    if idx >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[idx] = embedding_vector


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


preprocessing
lowercase
removing breaks
expanding contractions
replacing smileys
replacing ip
removing links
replacing numbers
removing bigrams
isolating punct
preprocessing
lowercase
removing breaks
expanding contractions
replacing smileys
replacing ip
removing links
replacing numbers
removing bigrams
isolating punct


2000001it [01:31, 21953.52it/s]
100%|██████████| 377039/377039 [00:00<00:00, 1153811.38it/s]


In [2]:
from keras.layers import Dense, Input, Embedding, Dropout, Bidirectional, GRU, Flatten, SpatialDropout1D, MaxPool1D,Concatenate

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))


def squash(x, axis=-1):
    # s_squared_norm is really small
    # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
    # return scale * x
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [40]:
from keras.layers import Dropout, MaxPooling1D, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, add,BatchNormalization
from keras.layers import Dense, Embedding, Input, Bidirectional, concatenate, PReLU, SpatialDropout1D, Activation
from keras.optimizers import Adam, RMSprop, Nadam
from keras.models import Model, load_model

act, pad, kernel_ini = "linear", "same", "he_uniform"
def build_model(units = 0, k = 0, num_block = 0, lr = 0.0, dr = 0.0):
    inp = Input(shape = (maxlen, ))
    emb = Embedding(nb_words, embed_size, weights = [embedding_matrix], 
                    input_length = maxlen, trainable = False)(inp)
    emb = SpatialDropout1D(dr)(emb)
    #capsule = Capsule(num_capsule=256, dim_capsule=8, routings=3)(emb)
    capsule = Capsule(num_capsule=128, dim_capsule=32, routings=3)(emb)
    capsule = Capsule(num_capsule=64, dim_capsule=32, routings=3)(capsule)
    capsule = Capsule(num_capsule=32, dim_capsule=32, routings=3)(capsule)
    capsule = Capsule(num_capsule=16, dim_capsule=32, routings=3)(capsule)
    capsule = Capsule(num_capsule=8, dim_capsule=32, routings=3)(capsule)
    
    out_put = Flatten()(capsule)

    out_put = Dense(6, activation = "sigmoid")(out_put)
    model = Model(inputs = inp, outputs = out_put)
    #model.compile(loss = "binary_crossentropy", optimizer = Adam(lr=lr), metrics = ["accuracy"])
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr=lr), metrics = ["accuracy"])
    #lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False
    return model

In [26]:
from keras.layers import Dropout, MaxPooling1D, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, add,BatchNormalization
from keras.layers import Dense, Embedding, Input, Bidirectional, concatenate, PReLU, SpatialDropout1D, Activation
from keras.optimizers import Adam, RMSprop, Nadam
from keras.models import Model, load_model

act, pad, kernel_ini = "linear", "same", "he_uniform"
def build_model(units = 0, k = 0, num_block = 0, lr = 0.0, dr = 0.0):
    inp = Input(shape = (maxlen, ))
    emb = Embedding(nb_words, embed_size, weights = [embedding_matrix], 
                    input_length = maxlen, trainable = False)(inp)
    emb = SpatialDropout1D(dr)(emb)
    capsule = Capsule(num_capsule=128, dim_capsule=16, routings=3)(emb)
    capsule = Capsule(num_capsule=64, dim_capsule=16, routings=3)(capsule)
    capsule = Capsule(num_capsule=32, dim_capsule=16, routings=3)(capsule)
    capsule = Capsule(num_capsule=16, dim_capsule=16, routings=3)(capsule)
    capsule = Capsule(num_capsule=8, dim_capsule=16, routings=3)(capsule)
    
    out_put = Flatten()(capsule)

    out_put = Dense(6, activation = "sigmoid")(out_put)
    model = Model(inputs = inp, outputs = out_put)
    #model.compile(loss = "binary_crossentropy", optimizer = Adam(lr=lr), metrics = ["accuracy"])
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr=lr), metrics = ["accuracy"])
    #lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False
    return model

In [41]:
model = build_model(units = 256, k = 3, num_block = 2, lr = 0.001, dr = 0.2)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_35 (InputLayer)        (None, 200)               0         
_________________________________________________________________
embedding_35 (Embedding)     (None, 200, 300)          45000000  
_________________________________________________________________
spatial_dropout1d_35 (Spatia (None, 200, 300)          0         
_________________________________________________________________
capsule_135 (Capsule)        (None, 128, 32)           1228800   
_________________________________________________________________
capsule_136 (Capsule)        (None, 64, 32)            65536     
_________________________________________________________________
capsule_137 (Capsule)        (None, 32, 32)            32768     
_________________________________________________________________
capsule_138 (Capsule)        (None, 16, 32)            16384     
__________

In [43]:
fold_count = 10
fold_size = len(X) // 10
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    fold_end = fold_start + fold_size

    if fold_id == fold_size - 1:
        fold_end = len(X)

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    model = build_model(units = 256, k = 3, num_block = 3, lr = 0.001, dr = 0.2)
    file_path = "DPCNN_CAPS_%s_.hdf5" %fold_id
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 64, epochs = 10, validation_data = (X_valid, Y_valid), 
                    verbose = 1, callbacks = [ra_val, early_stop, check_point])

Train on 143614 samples, validate on 15957 samples
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[32,128,200,32]
	 [[Node: training_22/Adam/gradients/capsule_145/MatMul_grad/MatMul_1 = BatchMatMul[T=DT_FLOAT, _class=["loc:@capsule_145/MatMul"], adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](capsule_145/Reshape_3, training_22/Adam/gradients/capsule_145/Squeeze_grad/Reshape)]]

Caused by op 'training_22/Adam/gradients/capsule_145/MatMul_grad/MatMul_1', defined at:
  File "/home/christof/miniconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/christof/miniconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/christof/miniconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 112, in start
    self.asyncio_loop.run_forever()
  File "/home/christof/miniconda3/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/home/christof/miniconda3/lib/python3.6/asyncio/base_events.py", line 1426, in _run_once
    handle._run()
  File "/home/christof/miniconda3/lib/python3.6/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 102, in _handle_events
    handler_func(fileobj, events)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/christof/miniconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/christof/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-43-ea2343eba21b>", line 21, in <module>
    verbose = 1, callbacks = [ra_val, early_stop, check_point])
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1646, in fit
    self._make_train_function()
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/engine/training.py", line 970, in _make_train_function
    loss=self.total_loss)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/optimizers.py", line 434, in get_updates
    grads = self.get_gradients(loss, params)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/optimizers.py", line 78, in get_gradients
    grads = K.gradients(loss, params)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2512, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 353, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 581, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py", line 1027, in _BatchMatMul
    grad_y = math_ops.matmul(x, grad, adjoint_a=False, adjoint_b=False)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1861, in matmul
    a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 709, in _batch_mat_mul
    "BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

...which was originally created as op 'capsule_145/MatMul', defined at:
  File "/home/christof/miniconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 21 identical lines from previous traceback]
  File "/home/christof/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-43-ea2343eba21b>", line 15, in <module>
    model = build_model(units = 256, k = 3, num_block = 3, lr = 0.001, dr = 0.2)
  File "<ipython-input-40-3e92db08e00e>", line 13, in build_model
    capsule = Capsule(num_capsule=128, dim_capsule=32, routings=3)(emb)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/engine/topology.py", line 617, in __call__
    output = self.call(inputs, **kwargs)
  File "<ipython-input-2-f1c5d115e543>", line 137, in call
    outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
  File "/home/christof/miniconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 1151, in batch_dot
    out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1861, in matmul
    a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 709, in _batch_mat_mul
    "BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/christof/miniconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[32,128,200,32]
	 [[Node: training_22/Adam/gradients/capsule_145/MatMul_grad/MatMul_1 = BatchMatMul[T=DT_FLOAT, _class=["loc:@capsule_145/MatMul"], adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](capsule_145/Reshape_3, training_22/Adam/gradients/capsule_145/Squeeze_grad/Reshape)]]


In [13]:
from global_variables import LIST_CLASSES
list_of_preds = []
list_of_vals = []
list_of_y = []
fold_count = 10
fold_size = len(X) // 10
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    fold_end = fold_start + fold_size

    if fold_id == 9:
        fold_end = len(X)

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    file_path = 'DPCNN_3216_' + str(fold_id) + '_.hdf5'
    #model = build_model(lr = 0.001)
    #model.load_weights(file_path)
    model = load_model(file_path)
    preds = model.predict(X_test, batch_size = 256, verbose = 1)
    list_of_preds.append(preds)
    vals = model.predict(X_valid, batch_size = 256, verbose = 1)
    list_of_vals.append(vals)
    list_of_y.append(Y_valid)
test_predicts = np.zeros(list_of_preds[0].shape)
for fold_predict in list_of_preds:
    test_predicts += fold_predict

test_predicts /= len(list_of_preds)
submission = pd.read_csv('assets/raw_data/sample_submission.csv')
submission[LIST_CLASSES] = test_predicts
submission.to_csv('DPCNN_3216_l2_test_data.csv', index=False)

l2_data = pd.DataFrame(columns=['logits_' + c for c in LIST_CLASSES]+LIST_CLASSES)
l2_data[['logits_' + c for c in LIST_CLASSES]] = pd.DataFrame(np.concatenate(list_of_vals,axis = 0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y,axis = 0))
l2_data.to_csv('DPCNN_3216_l2_train_data.csv')

