In [1]:
%matplotlib inline
seed = 1234
import pandas as pd
import os, glob
import numpy as np
np.random.seed(seed)
import random
random.seed(seed)
# fix random seed for reproducibility
import tensorflow as tf
tf.compat.v1.random.set_random_seed(seed)
import keras
from numpy import array
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import text_utilities as tu
import modeling_utils as mu
import image_utilities as iu
import ocr
import doc_classifier_model as dcm
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras import backend as K
import json

sess_config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1,
allow_soft_placement=True, device_count = {'CPU': 1})
sess = tf.Session(graph=tf.get_default_graph(),config=sess_config)
K.set_session(sess)

plt.style.use('ggplot')


Using TensorFlow backend.


In [2]:
print(tf.__version__)
print(keras.__version__)

1.15.0
2.3.0


## **OCR function applied on images**

In [3]:
#data_path = '/home/sureclaim/Documents/Claims/real_data/'
#data_df = ocr.create_data_df(data_path)
#data_df.to_csv('/home/sureclaim/Documents/Claims/data_df.csv')

In [4]:
data_df = pd.read_csv('data/data_df_new.csv')
#data_df = pd.read_csv('/home/sureclaim/Documents/Claims/data_df.csv')
data_df.shape

(1946, 5)

In [5]:
print("NA counts:", data_df.isna().sum())
data_df = data_df.dropna()

NA counts: Unnamed: 0     0
filename       0
path           0
x_txt         65
y              0
dtype: int64


# DV distribution

In [6]:
# One hot encode category
data_df['y_oneh'] = mu.onehot_encode(data_df.y)
mu.get_dv_dist(data_df, 'filename', 'y')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


y,Aadhar Card,Diagnostic Bill,Discharge Summary,Insurance Card,Internal Case Papers,Pan Card,Phramacy Bill,Policy Copy,Prescriptions,Receipts
counts,112.0,114.0,268.0,111.0,277.0,46.0,306.0,145.0,312.0,190.0
perc (%),5.95,6.06,14.25,5.9,14.73,2.45,16.27,7.71,16.59,10.1


# Make Vocab

In [7]:
# Clean the text column; keep only alphabets
data_df['x_txt_cleaned'] = data_df.x_txt.apply(tu.clean_string)

# Setting max padded sequence length = 70
max_len = 70
vocab = tu.make_vocab(data_df.x_txt_cleaned)
w2i = tu.make_w2i(vocab)

Vocab size: 29366


# Train - Test Split

In [8]:
# Shuffle dataframe
data_df = data_df.sample(frac=1)

# Get word level splits
x_train, x_test, y_train, y_test = train_test_split(data_df[['path', 'x_txt_cleaned']], data_df.y_oneh, test_size=0.3, random_state=201)

print("----------------------------------------")
print("Train X Size:", x_train.shape)
print("Train Y Size:", y_train.shape)
print("----------------------------------------")
print("Test X size:", x_test.shape)
print("Test Y Size:", y_test.shape)

# TODO show DV dist for every sample

----------------------------------------
Train X Size: (1316, 2)
Train Y Size: (1316,)
----------------------------------------
Test X size: (565, 2)
Test Y Size: (565,)


# Make Tensors

In [9]:
# make tensors
x_txt_train = tu.make_tensor_np(x_train.x_txt_cleaned, w2i, max_len)
#x_img_train = np.stack(x_train.path.apply(rfe.generate_image_features, args=[resnet50]))
#np.save('data/img_features_train_new.npy', x_img_train)
x_img_train = np.load('data/img_features_train.npy')
y_train = np.stack(y_train.to_numpy())

x_txt_test = tu.make_tensor_np(x_test.x_txt_cleaned, w2i, max_len)
#x_img_test = np.stack(x_test.path.apply(rfe.generate_image_features, args=[resnet50]))
#np.save('data/img_features_test_new.npy', x_img_test)
x_img_test = np.load('data/img_features_test.npy')
y_test = np.stack(y_test.to_numpy())

print("X text train tensor shape:", x_txt_train.shape)
print("X images train tensor shape:", x_img_train.shape)
print("y train shape:", y_train.shape)
print("----------------------------------------")
print("X text test tensor shape:", x_txt_test.shape)
print("X images test tensor shape:", x_img_test.shape)
print("y test shape:", y_test.shape)

X text train tensor shape: (1316, 70)
X images train tensor shape: (1316, 131072)
y train shape: (1316, 10)
----------------------------------------
X text test tensor shape: (565, 70)
X images test tensor shape: (565, 131072)
y test shape: (565, 10)


# DV Distribution by Dataset

In [10]:
# TODO: make reports, write to csv
print("Train DV distribution:----------------------------------------------------------------------- ")
print(mu.get_dv_dist(y_train))
print("Test DV distribution:------------------------------------------------------------------------ ")
print(mu.get_dv_dist(y_test))

Train DV distribution:----------------------------------------------------------------------- 
            0     1      2     3      4     5      6      7      8      9
counts   85.0  77.0  191.0  78.0  199.0  33.0  213.0  107.0  205.0  128.0
perc(%)   6.5   5.9   14.5   5.9   15.1   2.5   16.2    8.1   15.6    9.7
Test DV distribution:------------------------------------------------------------------------ 
            0     1     2     3     4     5     6     7      8     9
counts   27.0  37.0  77.0  33.0  78.0  13.0  93.0  38.0  107.0  62.0
perc(%)   4.8   6.5  13.6   5.8  13.8   2.3  16.5   6.7   18.9  11.0


# Define callbacks

In [11]:

n_classes = y_test.shape[1]
vocab_size = len(vocab)

config = dict({'input_text_shape': max_len, 'input_img_shape': x_img_train.shape[1], 'n_classes': n_classes, 'vocab_size': vocab_size, 'w2i':w2i})

with open('model_config', 'w') as configfile:
    json.dump(config, configfile, indent=2)

doc_classifier = dcm.build_doc_classifier(max_len, x_img_train.shape[1], n_classes, vocab_size)

## Defining keras callbacks
log_dir = 'models/tf-log/'
tb_cb = TensorBoard(log_dir=log_dir, histogram_freq=0)

model_checkpoint = ModelCheckpoint('models/doc_classifier.ckpt', monitor='f1', save_weights_only=True, mode = 'max', save_best_only=True, verbose=2)
early_stopping = EarlyStopping(monitor='f1', mode = 'max', patience=30, verbose=2)
reduce_lr = ReduceLROnPlateau(monitor='f1', mode = 'max', factor=0.5, patience=5, min_lr=0.00001, verbose=2)
cbks = [tb_cb, early_stopping, reduce_lr, model_checkpoint]


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


# Train

In [12]:
# Add an op to initialize the variables.
init_op = tf.global_variables_initializer()

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, initialize the variables, do some work, and save the
# variables to disk.
with tf.Session() as sess:
    sess.run(init_op)
    # Do some work with the model
    # train LSTM
    history = doc_classifier.fit([x_img_train, x_txt_train], y_train,
                    batch_size=256, 
                    epochs=30, 
                    shuffle=False,
                    validation_split=0.1,
                    callbacks=cbks,
                    verbose=1)
    save_path = saver.save(sess, "models/sess2.ckpt")
    print("Sess saved in path: %s" % save_path)
    
    ##Save model weights
    doc_classifier.save_weights('models/doc_classifier2.h5')
    
    model_json = doc_classifier.to_json()
    with open('model2.json', 'w') as json_file:
        json_file.write(model_json)
    doc_classifier.save_weights('model2.h5')
    
    loss, accuracy, f1 = doc_classifier.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
    print("Training Accuracy: ", round(accuracy*100, 2),"%" )
    print("Training F1: ", round(f1*100, 2),"%" )
    print("Training Loss: ", loss )
    print(mu.get_model_metrics(doc_classifier, [x_img_train, x_txt_train], y_train))
    print('-----------------------------------------------------------------------------------')
    loss, accuracy, f1 = doc_classifier.evaluate([x_img_test, x_txt_test], y_test, verbose=2)
    print("Test Accuracy: ", round(accuracy*100, 2),"%" )
    print("Test F1: ", round(f1*100, 2),"%" )
    print("Test Loss: ", loss )
    print(mu.get_model_metrics(doc_classifier, [x_img_test, x_txt_test], y_test))

#mu.plot_history(history)


Train on 1184 samples, validate on 132 samples


Epoch 1/30


Epoch 00001: f1 improved from -inf to 0.16718, saving model to models/doc_classifier.ckpt
Epoch 2/30

Epoch 00002: f1 improved from 0.16718 to 0.28756, saving model to models/doc_classifier.ckpt
Epoch 3/30

Epoch 00003: f1 improved from 0.28756 to 0.38411, saving model to models/doc_classifier.ckpt
Epoch 4/30

Epoch 00004: f1 improved from 0.38411 to 0.45441, saving model to models/doc_classifier.ckpt
Epoch 5/30

Epoch 00005: f1 improved from 0.45441 to 0.51989, saving model to models/doc_classifier.ckpt
Epoch 6/30

Epoch 00006: f1 improved from 0.51989 to 0.58311, saving model to models/doc_classifier.ckpt
Epoch 7/30

Epoch 00007: f1 improved from 0.58311 to 0.65214, saving model to models/doc_classifier.ckpt
Epoch 8/30

Epoch 00008: f1 improved from 0.65214 to 0.70201, saving model to models/doc_classifier.ckpt
Epoch 9/30

Epoch 00009: f1 improved from 0.70201 to 0.75113, saving model to models/doc_classifier.ckpt
Epoch 1


Epoch 00028: f1 did not improve from 0.98657
Epoch 29/30

Epoch 00029: f1 did not improve from 0.98657
Epoch 30/30

Epoch 00030: f1 improved from 0.98657 to 0.99050, saving model to models/doc_classifier.ckpt
Sess saved in path: models/sess2.ckpt
Training Accuracy:  92.17 %
Training F1:  91.25 %
Training Loss:  0.27510066869411065
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        85
           1       1.00      0.92      0.96        77
           2       0.95      0.93      0.94       191
           3       0.95      0.97      0.96        78
           4       0.92      0.99      0.96       199
           5       0.97      0.94      0.95        33
           6       0.99      0.94      0.96       213
           7       1.00      0.53      0.70       107
           8       0.76      0.98      0.86       205
           9       0.96      0.96      0.96       128

    accuracy                           0.92      1316
   macro avg    

In [20]:
from keras_self_attention import SeqSelfAttention
from keras.optimizers import Adagrad

tf.reset_default_graph()
# Add ops to save and restore all the variables.
#saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    
    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = keras.models.model_from_json(loaded_model_json, custom_objects={'SeqSelfAttention': SeqSelfAttention})
    loaded_model.load_weights('model.h5')

    opt = Adagrad(lr = 1e-3)
        #sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
    loaded_model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=["accuracy"])

    graph = tf.get_default_graph()
    
    saver = tf.train.Saver()
    # Restore variables from disk.
    saver.restore(sess, "models/sess.ckpt")
    print("Session restored.")
    
    # Re-evaluate the model
    with graph.as_default():
        loss,acc = loaded_model.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
    print("Restored model, train accuracy: {:5.2f}%".format(100*acc))
    with graph.as_default():
        loss,acc = loaded_model.evaluate([x_img_test, x_txt_test], y_test, verbose=2)
    print("Restored model, test accuracy: {:5.2f}%".format(100*acc))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from models/sess.ckpt
Session restored.
Restored model, train accuracy: 88.83%
Restored model, test accuracy: 37.52%


## Get model evaluation metrics


In [15]:
loss, accuracy, f1 = doc_classifier.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
print("Training Accuracy: ", round(accuracy*100, 2),"%" )
print("Training F1: ", round(f1*100, 2),"%" )
print("Training Loss: ", loss )
print(mu.get_model_metrics(doc_classifier, [x_img_train, x_txt_train], y_train))
print('-----------------------------------------------------------------------------------')
loss, accuracy, f1 = doc_classifier.evaluate([x_img_test, x_txt_test], y_test, verbose=2)
print("Test Accuracy: ", round(accuracy*100, 2),"%" )
print("Test F1: ", round(f1*100, 2),"%" )
print("Test Loss: ", loss )
print(mu.get_model_metrics(doc_classifier, [x_img_test, x_txt_test], y_test))

FailedPreconditionError: Attempting to use uninitialized value bidirectional_1/forward_lstm_1/kernel
	 [[node bidirectional_1/forward_lstm_1/kernel/read (defined at /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:376) ]]

Caused by op 'bidirectional_1/forward_lstm_1/kernel/read', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 563, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2855, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2881, in _run_cell
    return runner(coro)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3058, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3249, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-7256a0c6aafb>", line 9, in <module>
    doc_classifier = dcm.build_doc_classifier(max_len, x_img_train.shape[1], n_classes, vocab_size)
  File "/home/sureclaim/Documents/Claims/GPU_files/doc_classifier_model.py", line 67, in build_doc_classifier
    recurrent_dropout=0.2))(txt_attn)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/topology.py", line 576, in __call__
    self.build(input_shapes[0])
  File "/usr/local/lib/python3.6/dist-packages/keras/layers/wrappers.py", line 320, in build
    self.forward_layer.build(input_shape)
  File "/usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py", line 445, in build
    self.cell.build(step_input_shape)
  File "/usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py", line 1707, in build
    constraint=self.kernel_constraint)
  File "/usr/local/lib/python3.6/dist-packages/keras/legacy/interfaces.py", line 87, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/topology.py", line 400, in add_weight
    constraint=constraint)
  File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 376, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 213, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 176, in _variable_v1_call
    aggregation=aggregation)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 155, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variable_scope.py", line 2495, in default_variable_creator
    expected_shape=expected_shape, import_scope=import_scope)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 217, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 1395, in __init__
    constraint=constraint)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variables.py", line 1557, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py", line 81, in identity
    ret = gen_array_ops.identity(input, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 3890, in identity
    "Identity", input=input, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value bidirectional_1/forward_lstm_1/kernel
	 [[node bidirectional_1/forward_lstm_1/kernel/read (defined at /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:376) ]]


In [21]:
model_json = doc_classifier.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)
doc_classifier.save_weights('model.h5')

In [28]:
from keras_self_attention import SeqSelfAttention

#del loaded_model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = keras.models.model_from_json(loaded_model_json, custom_objects={'SeqSelfAttention': SeqSelfAttention})
loaded_model.load_weights('model.h5')

opt = Adam(lr = 1e-3)
#sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
loaded_model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=["accuracy"])

graph = tf.get_default_graph()
# Re-evaluate the model
loss,acc = loaded_model.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 52.96%


In [18]:
# Create a basic model instance
with open('model_config', 'r') as configfile:
    config = json.load(configfile)
    
model = dcm.build_doc_classifier(config['input_text_shape'], config['input_img_shape'], config['n_classes'], config['vocab_size'])

# Evaluate the model
loss, acc, f1 = model.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100*acc))


# Loads the weights

model.load_weights("models/doc_classifier.ckpt")

# Re-evaluate the model
loss,acc, f1 = model.evaluate([x_img_train, x_txt_train], y_train, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Untrained model, accuracy: 10.03%
Restored model, accuracy: 81.84%
