In [1]:
from toxic.model import get_model
from toxic.nltk_utils import tokenize_sentences
from toxic.train_utils import train_folds
from toxic.embedding_utils import read_embedding_list, clear_embedding_list, convert_tokens_to_ids

import argparse
import numpy as np
import os
import pandas as pd

import nltk

#nltk.download('punkt')

UNKNOWN_WORD = "_UNK_"
END_WORD = "_END_"
NAN_WORD = "_NAN_"

CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4

parser = argparse.ArgumentParser(
    description="Recurrent neural network for identifying and classifying toxic online comments")

parser.add_argument("train_file_path", default='../data/train.csv')
parser.add_argument("test_file_path", default='../data/test.csv')
parser.add_argument("embedding_path", default='crawl-300d-2M.vec')
parser.add_argument("--result-path", default="toxic_results")
parser.add_argument("--batch-size", type=int, default=256)
parser.add_argument("--sentences-length", type=int, default=500)
parser.add_argument("--recurrent-units", type=int, default=64)
parser.add_argument("--dropout-rate", type=float, default=0.3)
parser.add_argument("--dense-size", type=int, default=32)
parser.add_argument("--fold-count", type=int, default=10)

#args = parser.parse_args()

# To get all defaults:
args_list = ["train_file_path","test_file_path","embedding_path","result-path","batch-size","sentences-length","recurrent-units","dropout-rate","dense-size","fold-count"]
args = {}
for key in args_list:
    args[key] = parser.get_default(key)
print(args)
#args = {
 #   "train_file_path": '',
  #  "test_file_path": '',
   # "embedding_path": '',
    #"--result-path": '',
   # "--batch-size": '',
   # "--sentences-length": '',
   # "--recurrent-units": '',
   # "--dropout-rate": '',
   # "--dense-size": '',
   # "--fold-count": ''
#}

#if args["fold_count"] <= 1:
#    raise ValueError("fold-count should be more than 1")

print("Loading data...")
train_data = pd.read_csv(args["train_file_path"])
test_data = pd.read_csv(args["test_file_path"])

list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
y_train = train_data[CLASSES].values

print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

Using TensorFlow backend.


{'batch-size': None, 'sentences-length': None, 'recurrent-units': None, 'fold-count': None, 'dropout-rate': None, 'embedding_path': 'crawl-300d-2M.vec', 'dense-size': None, 'result-path': None, 'train_file_path': '../data/train.csv', 'test_file_path': '../data/test.csv'}
Loading data...


  0%|          | 134/159571 [00:00<01:59, 1339.58it/s]

Tokenizing sentences in train set...


100%|██████████| 159571/159571 [01:51<00:00, 1437.11it/s]
  0%|          | 146/153164 [00:00<01:45, 1451.94it/s]

Tokenizing sentences in test set...


100%|██████████| 153164/153164 [01:41<00:00, 1509.43it/s]


In [2]:
words_dict[UNKNOWN_WORD] = len(words_dict)

from toxic.embedding_utils import read_embedding_list

print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(args["embedding_path"])
embedding_size = len(embedding_list[0])

print("Preparing data...")
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
embedding_list.append([0.] * embedding_size)
embedding_word_dict[END_WORD] = len(embedding_word_dict)
embedding_list.append([-1.] * embedding_size)

embedding_matrix = np.array(embedding_list)

2452it [00:00, 12218.15it/s]

Loading embeddings...


1999995it [02:53, 11515.25it/s]


Preparing data...


In [3]:
args = {
    "train_file_path": '../data/train.csv',
    "test_file_path": '../data/test.csv',
    "embedding_path": 'crawl-300d-2M.vec',
    "result_path": "toxic_results",
    "batch_size": 256,
    "sentences_length": 500,
    "recurrent_units": 64,
    "dropout_rate": 0.3,
    "dense_size": 32,
    "fold_count": 10
}

In [4]:
from toxic.train_utils import train_folds

id_to_word = dict((id, word) for word, id in words_dict.items())
train_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_train,
    id_to_word,
    embedding_word_dict,
    args["sentences_length"])
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test,
    id_to_word,
    embedding_word_dict,
    args["sentences_length"])
X_train = np.array(train_list_of_token_ids)
X_test = np.array(test_list_of_token_ids)

get_model_func = lambda: get_model(
    embedding_matrix,
    args["sentences_length"],
    args["dropout_rate"],
    args["recurrent_units"],
    args["dense_size"])

In [5]:
y_train = np.array(train_data[CLASSES].values)
print(args['fold_count'])
print(args['batch_size'])
print([args["sentences_length"],
    args["dropout_rate"],
    args["recurrent_units"],
    args["dense_size"]])


print("Starting to train models...")
models = train_folds(X_train, y_train, args["fold_count"], args["batch_size"], get_model_func)

if not os.path.exists(args["result_path"]):
    os.mkdir(args["result_path"])

10
256
[500, 0.3, 64, 32]
Starting to train models...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
model
<keras.engine.training.Model object at 0x7f624365fb38>
train_x
[[143313   5424  19131 ... 170209 170209 170209]
 [  5200 134395 154851 ... 170209 170209 170209]
 [170208 170208  73730 ... 170209 170209 170209]
 ...
 [ 47569  61919 134223 ... 170209 170209 170209]
 [143587 162708   9548 ... 170209 170209 170209]
 [170208 143587 103032 ... 170209 170209 170209]]
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
fitting...
Instructions for updating:
Use tf.cast instead.
Epoch 1/1


InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNN' used by node bidirectional_1/CudnnRNN (defined at /usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py:297) with these attrs: [seed=87654321, dropout=0, T=DT_FLOAT, input_mode="linear_input", direction="unidirectional", rnn_mode="gru", is_training=true, seed2=0]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

	 [[node bidirectional_1/CudnnRNN (defined at /usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py:297) ]]

Caused by op 'bidirectional_1/CudnnRNN', defined at:
  File "/usr/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1424, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 126, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 370, in dispatch_queue
    yield self.process_one()
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1080, in __init__
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/usr/local/lib/python3.5/dist-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-c96012a1a4db>", line 11, in <module>
    models = train_folds(X_train, y_train, args["fold_count"], args["batch_size"], get_model_func)
  File "/root/ml-project/ml-pro/Pavel/toxic/train_utils.py", line 62, in train_folds
    model = _train_model(get_model_func(), batch_size, train_x, train_y, val_x, val_y)
  File "<ipython-input-4-bb92dcf4fe4e>", line 22, in <lambda>
    args["dense_size"])
  File "/root/ml-project/ml-pro/Pavel/toxic/model.py", line 11, in get_model
    x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(embedding_layer)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/wrappers.py", line 427, in __call__
    return super(Bidirectional, self).__call__(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/base_layer.py", line 457, in __call__
    output = self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/wrappers.py", line 522, in call
    y = self.forward_layer.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py", line 90, in call
    output, states = self._process_batch(inputs, initial_state)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py", line 297, in _process_batch
    is_training=True)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1713, in __call__
    seed=self._seed)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1102, in _cudnn_rnn_no_input_c
    direction, dropout, seed, name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1014, in _cudnn_rnn
    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py", line 142, in cudnn_rnn
    seed2=seed2, is_training=is_training, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 501, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'CudnnRNN' used by node bidirectional_1/CudnnRNN (defined at /usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py:297) with these attrs: [seed=87654321, dropout=0, T=DT_FLOAT, input_mode="linear_input", direction="unidirectional", rnn_mode="gru", is_training=true, seed2=0]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

	 [[node bidirectional_1/CudnnRNN (defined at /usr/local/lib/python3.5/dist-packages/keras/layers/cudnn_recurrent.py:297) ]]


In [None]:
print("Predicting results...")
test_predicts_list = []
for fold_id, model in enumerate(models):
    model_path = os.path.join(args["result_path"], "model{0}_weights.npy".format(fold_id))
    np.save(model_path, model.get_weights())

    test_predicts_path = os.path.join(args["result_path"], "test_predicts{0}.npy".format(fold_id))
    test_predicts = model.predict(X_test, batch_size=args["batch_size"])
    test_predicts_list.append(test_predicts)
    np.save(test_predicts_path, test_predicts)

test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts *= fold_predict

test_predicts **= (1. / len(test_predicts_list))
test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

test_ids = test_data["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
submit_path = os.path.join(args["result_path"], "submit")
test_predicts.to_csv(submit_path, index=False)
