# Training a CV model for correct first page prediction

This notebook covers one of the approaches to training a CV model for predicting whether a page of the document is the first one or not -- a feature that would allow correct splitting for PDFs that consist of more than one actual document (we assume that the pages are already sorted).

Before you start, make sure you have **installed** and **initialized** the konfuzio_sdk package as shown in the readme of the [repository](https://github.com/konfuzio-ai/Python-SDK).

In [None]:
!pip install konfuzio-sdk

In [None]:
!konfuzio_sdk init

Importing necessary libraries and packages:

In [34]:
import cv2   
import keras
import os
 
import numpy as np
import tensorflow as tf

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from konfuzio_sdk.data import Project, Document
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

Setting seed for reproducibility purposes:

In [2]:
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)

### Gathering and processing the data

In [6]:
my_project = Project(id_=95)

In [7]:
train_docs = my_project.documents
test_docs = my_project.test_documents

In [8]:
for doc in train_docs:
    doc.get_images()

In [9]:
doc_paths = []

for el in os.listdir('data_95/documents/'):
    doc_paths.append('data_95/documents/' + el)

In [14]:
page_paths = []
labels = []

for path in doc_paths:
    try:
        for el in os.listdir(path + '/'):
            if el.split('.')[-1] == 'png':
                page_paths.append(path + '/' + el)
                if el.split('.')[-2] == 'page_1':
                    labels.append(1)
                else:
                    labels.append(0)
    except NotADirectoryError:
        pass

Processing images from training and test sets with Otsu binarization and resizing:

In [15]:
for img in tqdm(page_paths):
    image = cv2.imread(img)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(image, 120, 255, cv2.THRESH_BINARY + 
                                                cv2.THRESH_OTSU)
    image = cv2.resize(thresh1, (224,224), interpolation=cv2.INTER_AREA)
    cv2.imwrite('otsued/train/{}'.format(img.split('/')[-2] + '_' +
                                        img.split('/')[-1]), image)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 881/881 [00:13<00:00, 64.52it/s]


In [22]:
for doc in test_docs:
    doc.get_images()

In [29]:
test_paths = []

for el in os.listdir('data_95/documents/'):
    try:
        if int(el) >= 7065 and int(el) <= 7127 :
            test_paths.append('data_95/documents/' + el)
    except ValueError:
        pass

In [31]:
page_paths = []
labels = []

for path in test_paths:
    for el in os.listdir(path + '/'):
        if el.split('.')[-1] == 'png':
            page_paths.append(path + '/' + el)
            if el.split('.')[-2] == 'page_1':
                labels.append(1)
            else:
                labels.append(0)

In [33]:
for img in tqdm(page_paths):
    image = cv2.imread(img)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(image, 120, 255, cv2.THRESH_BINARY + 
                                                cv2.THRESH_OTSU)
    image = cv2.resize(thresh1, (224,224), interpolation=cv2.INTER_AREA)
    cv2.imwrite('otsued/test/{}'.format(img.split('/')[-2] + '_' +
                                        img.split('/')[-1]), image)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 235/235 [00:04<00:00, 55.85it/s]


Loading processed and sorted images:

In [2]:
trdata = ImageDataGenerator()
traindata = trdata.flow_from_directory(directory="drive/MyDrive/train",target_size=(224,224))
tsdata = ImageDataGenerator()
testdata = tsdata.flow_from_directory(directory="drive/MyDrive/test", target_size=(224,224))

Found 881 images belonging to 2 classes.
Found 235 images belonging to 2 classes.


Building VGG16 architecture:

In [None]:
model = Sequential()
model.add(Conv2D(input_shape=(224,224,3),filters=64,kernel_size=(3,3),padding="same", activation="relu"))
model.add(Conv2D(filters=64,kernel_size=(3,3),padding="same", activation="relu"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(Flatten())
model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=2, activation="softmax"))

In [None]:
opt = Adam(lr=0.001)
model.compile(optimizer=opt, loss=keras.losses.categorical_crossentropy, metrics=['accuracy'])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 224, 224, 64)      1792      
                                                                 
 conv2d_1 (Conv2D)           (None, 224, 224, 64)      36928     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 112, 112, 64)     0         
 )                                                               
                                                                 
 conv2d_2 (Conv2D)           (None, 112, 112, 128)     73856     
                                                                 
 conv2d_3 (Conv2D)           (None, 112, 112, 128)     147584    
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 56, 56, 128)      0         
 2D)                                                    

Training the model with 100 epochs:

In [9]:
checkpoint = ModelCheckpoint("vgg16_1.h5", monitor='val_accuracy', verbose=1, 
                             save_best_only=True, save_weights_only=False, mode='auto', period=1)
early = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=20, verbose=1, mode='auto')
hist = model.fit_generator(steps_per_epoch=100,generator=traindata, validation_data=testdata, 
                           validation_steps=10,epochs=100,callbacks=[checkpoint,early])



Epoch 1/100


  """







Epoch 1: val_accuracy improved from -inf to 0.73191, saving model to vgg16_1.h5


# Metrics & prediction

In [None]:
saved_model = load_model("drive/MyDrive/model/vgg16_1.h5")

In [37]:
def calculate_metrics(paths, model):
    true_positive = 0
    false_positive = 0
    false_negative = 0
    
    for path in tqdm(paths):
        if 'not_first_page' in path:
            label = 0
        else:
            label = 1

        img = image.load_img(path,target_size=(224,224))
        img = np.asarray(img)
        img = np.expand_dims(img, axis=0)
        output = saved_model.predict(img)

        if output[0][0] > output[0][1]:
            pred = 0
        else:
            pred = 1

        if label == 1 and pred == 1:
            true_positive += 1
        elif label == 1 and pred == 0:
            false_negative += 1
        elif label == 0 and pred == 1:
            false_positive += 1
    
    if true_positive + false_positive != 0:
        precision = true_positive / (true_positive + false_positive)
    else:
        precision = 0
    
    if true_positive + false_negative != 0:
        recall = true_positive / (true_positive + false_negative)
    else:
        recall = 0
    
    if precision + recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    
    else:
        f1 = 0
    
    return precision, recall, f1

In [21]:
paths = []

for el in os.listdir('drive/MyDrive/test/first_page'):
    paths.append('drive/MyDrive/test/first_page/' + el)

for el in os.listdir('drive/MyDrive/test/not_first_page'):
    paths.append('drive/MyDrive/test/not_first_page/' + el)


In [24]:
precision, recall, f1 = calculate_metrics(paths, saved_model)

100%|██████████| 235/235 [00:14<00:00, 16.18it/s]


In [25]:
print('\n Precision: {} \n Recall: {} \n F1 score: {}'.format(precision, recall, f1))


 Precision: 0.2680851063829787 
 Recall: 1.0 
 F1 score: 0.4228187919463087
