# Image Classification of Documents

## 1. Setup



### 1.1 Getting dataset into colab
After running split_data.ipynb (need to change it to a .py file), compress the data folder to a .zip file which would reduce the size by more than 10 and call it data.zip, then upload the compressed file to google drive. After importing the file, use !unzip to uncompress


Use one of the colab code snippet to get the list of all .zip files and use another to import the file. 


In [1]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# List .txt files in the root.
#
# Search query reference:
# https://developers.google.com/drive/v2/web/search-parameters
listed = drive.ListFile({'q': "title contains '.zip'"}).GetList()
for file in listed:
  print('title {}, id {}'.format(file['title'], file['id']))

[?25l[K    1% |▎                               | 10kB 21.1MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.7MB/s eta 0:00:01[K    3% |█                               | 30kB 2.6MB/s eta 0:00:01[K    4% |█▎                              | 40kB 3.3MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.9MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.3MB/s eta 0:00:01[K    9% |███                             | 92kB 2.5MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.7MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 3.9MB/s eta 0:00:01[K    13% |████▎                           | 133kB 3.9MB/s eta 0:00:01[K    14% |████▋                           | 143kB 3.9MB/s eta 0:00:01[

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1pMEgJWXyKopKo5OQTSpG1pTK8XFj7Cg9'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('data.zip')


In [0]:
!unzip data.zip

### Now restart the kernel by choosing Kernel > Restart.

In [0]:
# if you wanna test the code
# import os
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg
# import numpy as np

# os.chdir("data/test/Advertisement")

# img=mpimg.imread('2067311301_2067311302.tif')
# imgplot = plt.imshow(img)
# os.chdir("../../../")

In [6]:
!ls

adc.json  data	data.zip  sample_data


### 1.2 Import packages and libraries
Import the packages and libraries that you'll use:

In [0]:
import os, random
import numpy as np
import pandas as pd
import PIL
import keras
import itertools
from PIL import Image

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
%matplotlib inline 

from keras import backend as K
from keras.models import Sequential
from keras import applications
from keras import optimizers
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.layers import Input, Dropout, Flatten, Conv2D, MaxPooling2D, Dense, Activation
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping



### 2.2 Global Variables 
Enter the batch size for training, testing and validation dataset

In [0]:
batch_size_train = 20
batch_size_val = 10
batch_size_test = 10
num_classes= 5
intereseted_folder='data'
STANDARD_SIZE=(224,224)

# 4. Classification

## 4.1 Create the Datset

In [0]:
'''Creating data generators
'''
datagen=keras.preprocessing.image.ImageDataGenerator(data_format=K.image_data_format())

train_datagen = ImageDataGenerator(
horizontal_flip = True,
fill_mode = "nearest",
zoom_range = 0.3,
width_shift_range = 0.3,
height_shift_range=0.3,
rotation_range=30)

test_datagen = ImageDataGenerator(
horizontal_flip = True,
fill_mode = "nearest",
zoom_range = 0.3,
width_shift_range = 0.3,
height_shift_range=0.3,
rotation_range=30)

In [40]:
'''Input the Training Data
'''
train_path = "data/train/"
train_batches = ImageDataGenerator().flow_from_directory(train_path, target_size=(224,224), batch_size=batch_size_train)

Found 2167 images belonging to 8 classes.


In [41]:
'''Input the Validation Data
'''

val_path = "data/val/"
val_batches = ImageDataGenerator().flow_from_directory(val_path, target_size=(224,224), batch_size=batch_size_val)


Found 441 images belonging to 8 classes.


In [42]:
'''Input the Test Data
'''
test_path = 'data/test/'
test_batches = ImageDataGenerator().flow_from_directory(test_path, target_size=(224,224), batch_size=batch_size_test)


Found 465 images belonging to 8 classes.


In [15]:
test_imgs, test_labels = next(test_batches)
test_labels

array([[0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 

In [0]:
y_test= [ np.where(r==1)[0][0] for r in test_labels ]
y_test

[1, 7, 3, 7, 0, 3, 6, 7, 1, 1, 5, 1, 4, 4, 1, 2, 1, 3, 7, 1, 4, 1, 4, 3, 3]

## 4.2 Build the Model

In [0]:
model = applications.VGG19(weights = "imagenet", include_top=False, input_shape = (224, 224, 3))

In [59]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [0]:
# freezing layers
for layer in model.layers:
  layer.trainable = False

In [0]:
x = model.output
x = Flatten()(x)
x = Dense(512, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(256, activation="relu")(x)
x = Dense(64, activation="relu")(x)
predictions = Dense(8, activation="softmax")(x)

In [62]:
final_model=Model(inputs=model.input,outputs=predictions)

final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [0]:
# compile the model 
final_model.compile(loss = "categorical_crossentropy", optimizer = optimizers.SGD(lr=0.0001, momentum=0.1), metrics=["accuracy"])

In [0]:
# Save the model according to the conditions  
checkpoint = ModelCheckpoint("vgg16_1.h5", monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
early = EarlyStopping(monitor='val_acc', min_delta=0, patience=10, verbose=1, mode='auto')

## 4.3 Train the Model

The model will take about 30-45 minutes to train. 

In [65]:

#steps per epoch number of samples/batch size
final_model.fit_generator(
train_batches,
steps_per_epoch=109,
epochs = 5,
validation_data = val_batches,
validation_steps=45,
callbacks = [checkpoint, early],
verbose=1)

Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.57370, saving model to vgg16_1.h5
Epoch 2/5

Epoch 00002: val_acc improved from 0.57370 to 0.66667, saving model to vgg16_1.h5
Epoch 3/5

Epoch 00003: val_acc improved from 0.66667 to 0.70068, saving model to vgg16_1.h5
Epoch 4/5

Epoch 00004: val_acc improved from 0.70068 to 0.71429, saving model to vgg16_1.h5
Epoch 5/5

Epoch 00005: val_acc improved from 0.71429 to 0.73469, saving model to vgg16_1.h5


<keras.callbacks.History at 0x7f71b1bc3048>

## 4.4 Test the Model with External Test Images

In [0]:

# Your data file was loaded into a botocore.response.StreamingBody object.
# Please read the documentation of ibm_boto3 and pandas to learn more about your possibilities to load the data.
# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/
# pandas documentation: http://pandas.pydata.org/
streaming_body_2 = client_8e7d0cae69cd45ec8430f0389e234d95.get_object(Bucket='imagerecognitionpattern-donotdelete-pr-7whfpase0vr47w', Key='test_doc-external.zip')['Body']
# add missing __iter__ method so pandas accepts body as file-like object
if not hasattr(streaming_body_2, "__iter__"): streaming_body_2.__iter__ = types.MethodType( __iter__, streaming_body_2 ) 



In [0]:
#model.save_weights('my_model_weights.h5')
#model.load_weights('my_model_weights.h5')

In [0]:
from io import BytesIO
import zipfile

zip_ref = zipfile.ZipFile(BytesIO(streaming_body_2.read()),'r')
paths = zip_ref.namelist()
del paths[0]
print(paths)
for path in paths:
    print(zip_ref.extract(path))
zip_ref.close()

['test_doc-external/Form1 copy 2.jpg', 'test_doc-external/pan copy 2.jpg', 'test_doc-external/cheque copy 2.jpg', 'test_doc-external/driving_license_2 copy 2.jpg', 'test_doc-external/passport copy 2.jpg']
/home/dsxuser/work/test_doc-external/Form1 copy 2.jpg
/home/dsxuser/work/test_doc-external/pan copy 2.jpg
/home/dsxuser/work/test_doc-external/cheque copy 2.jpg
/home/dsxuser/work/test_doc-external/driving_license_2 copy 2.jpg
/home/dsxuser/work/test_doc-external/passport copy 2.jpg


In [0]:
X_test=[]
def convert_to_image(X):
    '''Function to convert all Input Images to the STANDARD_SIZE and create Training Dataset
    '''
    for f in paths:
        #fobj=get_file(f)
        #print(type(fobj))predictions= model.predict(X_test)
        if os.path.isdir(f):
            continue
        img= PIL.Image.open(f)
        img = img.resize(STANDARD_SIZE)
        img=np.array(img)
        X.append(img)
        #print(X_train)
    #print(len(X_train))
    return X
X_test=np.array(convert_to_image(X_test))
datagen.fit(X_test)

In [0]:
predictions= model.predict(X_test)
predictions

array([[ 0.47580448,  0.47760704,  0.51829034,  0.50596583,  0.47304007],
       [ 0.47949249,  0.4817391 ,  0.47868609,  0.51169461,  0.49393257],
       [ 0.51717132,  0.4783558 ,  0.47287402,  0.50082076,  0.48920417],
       [ 0.50053394,  0.47719797,  0.47839782,  0.49702418,  0.51833802],
       [ 0.4910911 ,  0.52186197,  0.5108099 ,  0.48280653,  0.47803456]], dtype=float32)

In [0]:
y_pred=[]
for i in range(len(predictions)):
    y_pred.append(np.argmax(predictions[i]))
y_pred
j = 0
for i in y_pred:
    print(paths[y_pred[j]])
    j = j + 1

test_doc-external/cheque copy 2.jpg
test_doc-external/driving_license_2 copy 2.jpg
test_doc-external/Form1 copy 2.jpg
test_doc-external/passport copy 2.jpg
test_doc-external/pan copy 2.jpg


In [0]:
#print(classes_required)
index= classes_required.index('Documents')
for i in range(len(y_pred)):
    if y_pred[i] == index:
        print("Image classified as a form document: ", paths[i])

Image classified as a form document:  test_doc-external/Form1 copy 2.jpg


## 4.5 Accuracy Testing

In [0]:
predictions = model.predict_generator(test_batches, steps=1, verbose=0)
predictions

array([[ 0.51717132,  0.4783558 ,  0.47287402,  0.50082076,  0.48920417],
       [ 0.50053394,  0.47719797,  0.47839782,  0.49702418,  0.51833802],
       [ 0.47949249,  0.4817391 ,  0.47868609,  0.51169461,  0.49393257],
       [ 0.4910911 ,  0.52186197,  0.5108099 ,  0.48280653,  0.47803456],
       [ 0.47580448,  0.47760704,  0.51829034,  0.50596583,  0.47304007]], dtype=float32)

In [0]:
predictions
y_pred=[]
for i in range(len(predictions)):
    y_pred.append(np.argmax(predictions[i]))
print(y_pred)
#plots(test_imgs, titles=y_pred)

ctr=0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        ctr=ctr+1
res = ctr/len(y_pred)*100
print(res)


[0, 4, 3, 1, 2]
100.0
