In [1]:
import image
from pytesseract import image_to_string
import cv2
import os
import pandas as pd
import glob
import numpy as np

In [3]:
# text extraction from form images
labels=['UB','HCFA','USB','MEDICARE','HCFASB']
file_list={}
for label in labels:
    file_paths = glob.glob('../'+label+'/*.tiff')
    for img_name in file_paths:
        img = cv2.imread(img_name)
        h, w = img.shape[:2]
        resized = cv2.resize(img, (w*4, h*4), interpolation = cv2.INTER_LINEAR)
        img_txt = image_to_string(resized)
        file_list[img_name] = [img_txt,label]       

In [4]:
df_file = pd.DataFrame.from_dict(file_list,orient='index')

In [5]:
df_file.columns=['text','label']
df_file['text']=df_file['text'].apply(lambda x: x.replace('\n',' '))
df_file

Unnamed: 0,text,label
../UB/UB2.tiff,I. “V canto cm»: —— “I Wu gm. mam—- A will am...,UB
../UB/UB1.tiff,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB
../UB/UB7.tiff,"“‘LIO‘. Q 1“ 0N~ ION» 0"" ""~ “.4 01M. “UV-lﬁm‘...",UB
../UB/UB8.tiff,10687127 31‘?“ ‘ ASDFASDF I COLLEGE HOSP...,UB
../UB/UB3.tiff,Sample of a Correctly Aligned UB-04 Claim Form...,UB
../UB/UB5.tiff,2 James K Smith Dr 123 W N BEAR CREEK DR LIVE...,UB
../UB/UB6.tiff,"‘Surgery Center 2645 Glass Rd, Ste 205 Weatla...",UB
../UB/UB4.tiff,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB
../HCFA/HCFA19.tiff,"mse — ”Pm who, +1445,“me 0,: (Wm? Lime 0? NOT...",HCFA
../HCFA/HCFA11.tiff,EVE El . HEALTHINSURANGEDLAINFORM .APPRDveD ...,HCFA


In [6]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(df_file.iloc[:,1])
encoded_Y = encoder.transform(df_file.iloc[:,1])
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

Using TensorFlow backend.
  return f(*args, **kwds)


In [7]:
# dummy_y

In [8]:
df_file['newlabel'] = df_file['label'].str.get_dummies().values.tolist()

In [9]:
one_hot = pd.get_dummies(df_file['label'])
df_file=df_file.join(one_hot)

In [10]:
row_label=pd.DataFrame(df_file.index)
row_label.columns=['File_name']
df_file=pd.DataFrame(np.array(df_file))
df_file

Unnamed: 0,0,1,2,3,4,5,6,7
0,I. “V canto cm»: —— “I Wu gm. mam—- A will am...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
1,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
2,"“‘LIO‘. Q 1“ 0N~ ION» 0"" ""~ “.4 01M. “UV-lﬁm‘...",UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
3,10687127 31‘?“ ‘ ASDFASDF I COLLEGE HOSP...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
4,Sample of a Correctly Aligned UB-04 Claim Form...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
5,2 James K Smith Dr 123 W N BEAR CREEK DR LIVE...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
6,"‘Surgery Center 2645 Glass Rd, Ste 205 Weatla...",UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
7,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
8,"mse — ”Pm who, +1445,“me 0,: (Wm? Lime 0? NOT...",HCFA,"[1, 0, 0, 0, 0]",1,0,0,0,0
9,EVE El . HEALTHINSURANGEDLAINFORM .APPRDveD ...,HCFA,"[1, 0, 0, 0, 0]",1,0,0,0,0


In [11]:
df_master=pd.concat([row_label,df_file], axis=1)
df_master.columns=['File_name','text','label', 'newlabel','HCFA','HCFASB','MEDICARE','UB','USB']
df_master

Unnamed: 0,File_name,text,label,newlabel,HCFA,HCFASB,MEDICARE,UB,USB
0,../UB/UB2.tiff,I. “V canto cm»: —— “I Wu gm. mam—- A will am...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
1,../UB/UB1.tiff,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
2,../UB/UB7.tiff,"“‘LIO‘. Q 1“ 0N~ ION» 0"" ""~ “.4 01M. “UV-lﬁm‘...",UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
3,../UB/UB8.tiff,10687127 31‘?“ ‘ ASDFASDF I COLLEGE HOSP...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
4,../UB/UB3.tiff,Sample of a Correctly Aligned UB-04 Claim Form...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
5,../UB/UB5.tiff,2 James K Smith Dr 123 W N BEAR CREEK DR LIVE...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
6,../UB/UB6.tiff,"‘Surgery Center 2645 Glass Rd, Ste 205 Weatla...",UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
7,../UB/UB4.tiff,p804 RED lames K Smith Dr 2 —I u— 2022 WILS...,UB,"[0, 0, 0, 1, 0]",0,0,0,1,0
8,../HCFA/HCFA19.tiff,"mse — ”Pm who, +1445,“me 0,: (Wm? Lime 0? NOT...",HCFA,"[1, 0, 0, 0, 0]",1,0,0,0,0
9,../HCFA/HCFA11.tiff,EVE El . HEALTHINSURANGEDLAINFORM .APPRDveD ...,HCFA,"[1, 0, 0, 0, 0]",1,0,0,0,0


In [12]:
df_master.to_csv('/home/affine/Downloads/Deep_Learning/demo/demo/FSL/Document Classification/df_master.csv', index=False)

In [13]:
df_master1=df_master

In [14]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import train_test_split

In [15]:
#Creating word embeddings

EMBEDDING_FILE='/home/affine/Downloads/Deep_Learning/demo/demo/Toxic_Comment/glove.6B/glove.6B.300d.txt'

embed_size = 300 # how big is each word vector
max_features = 813 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use
# list_classes = ['HCFA','HCFASB','MEDICARE','UB','USB']
list_classes = ['HCFA','MEDICARE','UB']

#Read in our data and replace missing values
train_X,test_X,train_Y,test_Y=train_test_split(df_master['text'],df_master[list_classes], test_size=0.33, random_state=42)

list_sentences_train = train_X.fillna("_na_").values

list_sentences_test = test_X.fillna("_na_").values



In [16]:
y=train_Y.values
y
test_Y.values

array([[1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]], dtype=object)

In [17]:
#Preprocessing
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)


In [18]:
#Read the glove word vectors (space delimited strings) into a dictionary from word->vector.
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))


In [19]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(-0.0039050116, 0.38177028)

In [20]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


In [21]:
print(embedding_matrix.shape)

(813, 300)


In [37]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(300, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(3, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [38]:
model.fit(X_t, y, batch_size=1, epochs=10, validation_split=0.1);


Train on 14 samples, validate on 2 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
y_test = model.predict([X_te], batch_size=1, verbose=1)



In [40]:
y_test
y_pred=np.array(y_test)
y_pred

array([[  9.96322632e-01,   1.50800112e-03,   2.16935435e-03],
       [  4.03965265e-01,   1.07690657e-03,   5.94957769e-01],
       [  5.09356201e-01,   1.01375650e-03,   4.89629984e-01],
       [  4.58764173e-02,   7.65576633e-03,   9.46467876e-01],
       [  2.16190126e-02,   1.05222007e-02,   9.67858732e-01],
       [  9.97212708e-01,   1.01503846e-03,   1.77217158e-03],
       [  2.19889119e-01,   1.42317079e-03,   7.78687716e-01],
       [  3.23716464e-04,   6.70916343e-05,   9.99609172e-01],
       [  5.81091106e-01,   1.50887074e-03,   4.17400062e-01]], dtype=float32)

In [41]:
y_actual=test_Y.values
y_actual

array([[1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]], dtype=object)

In [42]:
# y_act_non_cat=[np.argmax(t) for t in y_act]
# y_pred_non_cat=[np.argmax(t) for t in y_pred]


In [43]:
# from sklearn.metrics import confusion_matrix
# confusion_mat=confusion_matrix(y_act_non_cat,y_pred_non_cat, labels=['HCFA','HCFASB','MEDICARE','UB','USB'])

In [44]:
# n_classes=5
# y_test1=np.array(y_test)
# y_test_cat=np_utils.to_categorical(y_test1,n_classes)
# y_pred=y_test_cat.argmax(1)
# y_pred

In [45]:
y_pred1=y_pred.argmax(1)
y_pred1

array([0, 2, 0, 2, 2, 0, 2, 2, 0])

In [46]:
# y_act0=test_Y.values

# y_act1=np.array(y_act)
# y_act_cat=np_utils.to_categorical(y_act1,n_classes)
# y_act=y_act_cat.argmax(1)

# y_act

In [47]:
y_act=y_actual.argmax(1)
y_act

array([0, 0, 2, 1, 0, 0, 0, 2, 1])

In [48]:
from sklearn.metrics import confusion_matrix
confusion_mat=confusion_matrix(y_act,y_pred1)#, labels=['HCFA','HCFASB','MEDICARE','UB','USB'])

In [49]:
print(confusion_mat)

[[2 0 3]
 [1 0 1]
 [1 0 1]]
