# Project description

The aim of this project is to transcribe scanned images into text using Deep Learning and Computer Vision techniques.

The dataset that I used comes from the following website.
You need to create an account to have access to it.

The dataset consists of scanned images of forms, lines and words. Here, we focus only on the word images and their labels, which are available in the XML files.

As in any data science project, we need to explore and clean the data before applying any DL model to it.
In this project, I have already preprocessed the data, and I focus only on the 100 most frequent words in the dataset.

For this project, I decided to treat the task as a classification problem and to solve it with a CNN.

On the test set, I obtained an accuracy of 85%, but keep in mind that I greatly reduced the dataset. It would be interesting to see how the model performs when it is trained on the whole dataset.

# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D 
from keras.utils import np_utils
from tensorflow.keras import callbacks

# Import the data

In [None]:
# Load the preprocessed word table from the csv file into a DataFrame.
csv_path = '.../top100.csv'
data = pd.read_csv(csv_path)

# Preview the first rows.
data.head()

In [None]:
#Read all the images into a list of 65x65 grayscale arrays
#Here I put all png files in one folder 'raw'

path_images = '.../data/raw'

import cv2

X = []
for img_id in data["image_id"]:
    img_path = path_images + '/' + img_id + '.png'
    my_image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise when a file is missing or unreadable, it
    # returns None. Fail here with the offending path instead of letting
    # cv2.resize raise an unrelated-looking error.
    if my_image is None:
        raise FileNotFoundError("Could not read image: " + img_path)
    # Bring every word image to the fixed 65x65 size used by the model input.
    my_image = cv2.resize(my_image, dsize = (65,65), interpolation = cv2.INTER_LINEAR)
    X.append(my_image)

In [None]:
#Turn the image list and the labels into arrays shaped for the model

# Stack the images and add a trailing single-channel axis.
X = np.array(X).reshape([-1, 65, 65, 1])

# One label per row, arranged as a column vector.
y = np.array(data["text"]).reshape(-1, 1)

In [None]:
#Map every distinct label to an ordinal class index

from sklearn import preprocessing

enc = preprocessing.OrdinalEncoder(categories='auto')

# Learn the categories and encode the labels in one call.
target = enc.fit_transform(y)

In [None]:
#Create training and test sets

from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and the metrics computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = 0.2, random_state = 42)

# The ordinal encoder numbers the classes 0..n-1, so the class count is the
# largest index + 1. Passing it explicitly keeps both one-hot matrices the same
# width even if the highest class index is absent from one of the splits;
# otherwise to_categorical infers the width from the data it is given.
num_classes = int(target.max()) + 1

y_train = np_utils.to_categorical(y_train, num_classes = num_classes, dtype = "int")
y_test = np_utils.to_categorical(y_test, num_classes = num_classes, dtype = "int")

# Build the CNN model

In [None]:
# Three Conv2D + MaxPooling2D stages with decreasing filter counts, followed by
# dropout, flattening and a dense head with one softmax unit per class.
model = Sequential([
    Conv2D(filters = 256, kernel_size = (5,5), input_shape = (65,65,1), activation = "relu"),
    MaxPooling2D(pool_size = (2,2)),

    Conv2D(filters = 128, kernel_size = (3,3), activation = "relu"),
    MaxPooling2D(pool_size = (2,2)),

    Conv2D(filters = 64, kernel_size = (3,3), activation = "relu"),
    MaxPooling2D(pool_size = (2,2)),

    Dropout(rate = 0.2),
    Flatten(),
    Dense(units = 128, activation = "relu"),
    # Output width follows the one-hot label matrix.
    Dense(units = y_train.shape[1], activation = "softmax"),
])

model.summary()

# Compile and fit the model

In [None]:
#Use callbacks if needed
import os
cwd = os.getcwd()
filepath = cwd

# Give the checkpoint a real file name inside the working directory. The
# original target was the directory itself (filepath + '/'), which leaves the
# saved weights without a file name. The ".weights.h5" suffix is the one
# Keras 3 requires for weights-only checkpoints, and older tf.keras treats any
# path ending in ".h5" as an HDF5 weights file.
checkpoint_path = os.path.join(filepath, 'ocr_model_cnn.weights.h5')

# Stop immediately if the loss becomes NaN.
TON = callbacks.TerminateOnNaN()
# Stop after 3 epochs without val_loss improvement and roll back to the best weights.
early_stopping = callbacks.EarlyStopping(monitor = "val_loss", patience = 3, mode = "min", restore_best_weights = True)
# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = callbacks.ModelCheckpoint(filepath = checkpoint_path, monitor = "val_loss", save_best_only = True, save_weights_only = True, mode = "min", save_freq = "epoch")

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
    metrics = ["accuracy"],
)

In [None]:
# Validate on a slice of the training data (Keras takes the last 20% of the
# arrays passed to fit) instead of on the test set. Early stopping and the
# checkpoint both select weights by val_loss, so monitoring the test set would
# leak it into model selection and bias the test metrics computed afterwards.
training_history = model.fit(X_train, y_train, epochs = 5, validation_split = 0.2, callbacks = [TON, early_stopping, checkpoint])

In [None]:
#Write the trained model to disk
model_path = 'ocr_model_cnn.h5'
model.save(model_path)

# Plot accuracy and loss curves

In [None]:
# Per-epoch accuracy values recorded by fit() for the training and validation data.
accuracy_log = training_history.history
train_acc, val_acc = accuracy_log['accuracy'], accuracy_log['val_accuracy']

In [None]:
# Number the epochs from the recorded history instead of hard-coding 5:
# early stopping or TerminateOnNaN can end training sooner, and a fixed
# range would then not match the length of the accuracy lists.
epochs_run = np.arange(1, len(train_acc) + 1)

plt.xlabel("epochs")
plt.ylabel("accuracy")

plt.plot(epochs_run, train_acc, label = "training accuracy", color = "blue")
plt.plot(epochs_run, val_acc, label = "validation accuracy", color = "green")
plt.legend()
plt.show();

In [None]:
# Per-epoch loss values recorded by fit() for the training and validation data.
loss_log = training_history.history
train_loss, val_loss = loss_log['loss'], loss_log['val_loss']

In [None]:
# Number the epochs from the recorded history instead of hard-coding 5:
# early stopping or TerminateOnNaN can end training sooner, and a fixed
# range would then not match the length of the loss lists.
epochs_run = np.arange(1, len(train_loss) + 1)

plt.xlabel("epochs")
# This figure shows the loss curves, so label the axis accordingly.
plt.ylabel("loss")

plt.plot(epochs_run, train_loss, label = "training loss", color = "red")
plt.plot(epochs_run, val_loss, label = "validation loss", color = "yellow")
plt.legend()
plt.show();

# Make predictions

In [None]:
# Dividing by 1.0 turns the test images into a floating-point array before
# they are passed to the model.
X_test_float = X_test / 1.0
test_pred = model.predict(X_test_float)

In [None]:
# Collapse each row to the index of its largest entry: the predicted class for
# the model output, the true class for the one-hot labels.
test_pred_class = np.argmax(test_pred, axis = 1)
y_test_class = np.argmax(y_test, axis = 1)

In [None]:
#Plot some results

target_new = target.reshape([-1])
y_new = y.reshape([-1])

# Build the class-index -> word table once, instead of converting the whole
# target array to a list and scanning it twice for every displayed image.
class_to_word = dict(zip(target_new.astype(int).tolist(), y_new.tolist()))

# Draw distinct test samples so the same image is never shown twice
# (and never ask for more samples than there are predictions).
n_examples = min(3, len(test_pred))
sample_ids = np.random.choice(len(test_pred), size = n_examples, replace = False)

for j, i in enumerate(sample_ids, start = 1):
    img = X_test[i]

    true_word = class_to_word[int(y_test_class[i])]
    pred_word = class_to_word[int(test_pred_class[i])]

    plt.subplot(1, 3, j)
    plt.axis('off')
    plt.imshow(img, cmap="gray", interpolation='None')
    plt.title('True Label: ' + str(true_word) \
              + '\n' + 'Prediction: '+ str(pred_word))

# Evaluate the model

In [None]:
# Score the trained model on the held-out test images and labels.
results = model.evaluate(x = X_test, y = y_test)

In [None]:
#Print the confusion matrix to see where the model got confused

from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously passed the other
# way round, which produced the transposed matrix.
matrix = confusion_matrix(y_test_class, test_pred_class)
matrix

In [None]:
from sklearn.metrics import classification_report

# Compare the true classes with the predicted ones. The original call passed
# y_test_class for both arguments, which scores the labels against themselves
# and always reports perfect precision and recall.
report = classification_report(y_test_class, test_pred_class)
print(report)