In [None]:
# Link code: https://www.kaggle.com/code/aslanahmedov/automatic-number-plate-recognition

In [1]:
%pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import pytesseract as pt
import plotly.express as px
import matplotlib.pyplot as plt
import xml.etree.ElementTree as xet

from glob import glob
from skimage import io
from shutil import copy
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.preprocessing.image import load_img, img_to_array


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
%cd /content/gdrive/MyDrive/Colab Notebooks/ML/single_plate

/content/gdrive/MyDrive/Colab Notebooks/ML/single_plate


In [5]:
path = glob('./xml_folder/*.xml')
labels_dict = dict(filepath=[],xmin=[],xmax=[],ymin=[],ymax=[])
for filename in path:

    info = xet.parse(filename)
    root = info.getroot()
    member_object = root.find('object')
    labels_info = member_object.find('bndbox')
    xmin = int(labels_info.find('xmin').text)
    xmax = int(labels_info.find('xmax').text)
    ymin = int(labels_info.find('ymin').text)
    ymax = int(labels_info.find('ymax').text)

    labels_dict['filepath'].append(filename)
    labels_dict['xmin'].append(xmin)
    labels_dict['xmax'].append(xmax)
    labels_dict['ymin'].append(ymin)
    labels_dict['ymax'].append(ymax)

In the above code, we individually take each file and parse into xml.etree and find the object -> bndbox. Then we extract xmin,xmax,ymin,ymax and saved those values in the dictionary. After we convert it into a pandas data frame and save that into CSV file and save it in project folder as shown below.

In [6]:
df = pd.DataFrame(labels_dict)
df.to_csv('labels.csv',index=False)
df.head()

Unnamed: 0,filepath,xmin,xmax,ymin,ymax
0,./xml_folder/S358.xml,1582,3062,1033,2105
1,./xml_folder/S363.xml,1468,3010,886,2155
2,./xml_folder/S364.xml,1582,3071,822,1864
3,./xml_folder/S360.xml,1465,3087,875,1900
4,./xml_folder/S355.xml,1140,3168,422,1855


With the above code, we successfully extract the diagonal position of each image and convert the data from an unstructured to a structured format.You can have A look data above. Now also extract the respective image filename of the XML.

In [7]:
filename = df['filepath'][0]
def getFilename(filename):
    filename_image = xet.parse(filename).getroot().find('filename').text
    filepath_image = os.path.join('./img_folder',filename_image)
    return filepath_image
getFilename(filename)

'./img_folder/S358.jpg'

In [9]:
image_path = list(df['filepath'].apply(getFilename))
image_path[:10]

['./img_folder/S358.jpg',
 './img_folder/S363.jpg',
 './img_folder/S364.jpg',
 './img_folder/S360.jpg',
 './img_folder/S355.jpg',
 './img_folder/S341.jpg',
 './img_folder/S353.jpg',
 './img_folder/S340.jpg',
 './img_folder/S346.jpg',
 './img_folder/S349.jpg']

<p id="part10"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0"> VERIFY THE DATA</span>

As till now we did the manual process it is important to verify the information is we got is valid or not. For that just verify the bounding box is appearing properly for a given image. Here I consider the image N2.jpeg and the corresponding diagonal position can found in df. Result you can see on *Figure 8*

In [10]:
file_path = image_path[87] #path of our image N2.jpeg
img = cv2.imread(file_path) #read the image
fig = px.imshow(img)
fig.update_layout(width=600, height=500, margin=dict(l=10, r=10, b=10, t=10),xaxis_title='Figure 8 - N2.jpeg with bounding box')
fig.add_shape(type='rect',x0=df['xmin'][87], x1=df['xmax'][87], y0=df['ymin'][87], y1=df['ymax'][87], xref='x', yref='y',line_color='cyan')

Output hidden; open in https://colab.research.google.com to view.

<p id="part11"></p>

# <span style="font-family: Arials; font-size: 20px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">DATA PROCESSING</span>

<p id="part12"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">READ DATA</span>

This is a very important step, in this process we will take each and every image and convert it into an array using OpenCV and resize the image into 224 x 224 which is the standard compatible size of the pre-trained transfer learning model.

In [11]:
#Targeting all our values in array selecting all columns
labels = df.iloc[:,1:].values
data = []
output = []
for ind in range(len(image_path)):
    image = image_path[ind]
    img_arr = cv2.imread(image)
    h,w,d = img_arr.shape
    # Prepprocesing
    load_image = load_img(image,target_size=(224,224))
    load_image_arr = img_to_array(load_image)
    norm_load_image_arr = load_image_arr/255.0 # Normalization
    # Normalization to labels
    xmin,xmax,ymin,ymax = labels[ind]
    nxmin,nxmax = xmin/w,xmax/w
    nymin,nymax = ymin/h,ymax/h
    label_norm = (nxmin,nxmax,nymin,nymax) # Normalized output
    # Append
    data.append(norm_load_image_arr)
    output.append(label_norm)

After that, we will normalize the image just by dividing with maximum number as we know that the maximum number for an 8-bit image is 28 -1 = 255. That the reason we will divide our image 255.0. The way of diving an array with the maximum value is called Normalization (Min-Max Scaler). We also need to normalize our labels too. Because for the deep learning model the output range should be between 0 to 1. For normalizing labels, we need to divide the diagonal points with the width and height of the image. And finally values in a python list.
<p id="part13"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">SPLIT TRAIN AND TEST SET</span>
In the next step, we will convert the list into an array using __Numpy__.

In [12]:
# Convert data to array
X = np.array(data,dtype=np.float32)
y = np.array(output,dtype=np.float32)

Now split the data into training and testing set using __sklearn__.

In [13]:
# Split the data into training and testing set using sklearn.
x_train,x_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=0)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((294, 224, 224, 3), (74, 224, 224, 3), (294, 4), (74, 4))

<p id="part14"></p>

# <span style="font-family: Arials; font-size: 20px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">DEEP LEARNING FOR OBJECT DETECTION </span>

<p id="part15"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">INCEPTION-RESNET-V2 MODEL BUILDING</span>

Inception-ResNet-v2 is a convolutional neural network that is trained on more than a million images from the ImageNet database. The network is 164 layers deep and can classify images into 1000 object categories, such as keyboard, mouse, pencil, and many animals. As a result, the network has learned rich feature representations for a wide range of images. The Inception-ResNet-v2 was used for the classification task. The architecture of the network is shown in Figure 9 . Inception-Resnet-v2 is formulated based on a combination of the Inception structure and the Residual connection. In the Inception-Resnet block multiple sized convolutional filters are combined by residual connections. The usage of reyfual connections not only avoids the degradation problm caused by deep structures but also reduces the training time.

<img src= "https://github.com/Asikpalysik/Automatic-License-Plate-Detection/blob/main/Presentation/Notebook7.png?raw=true" width="50%" align="center"  hspace="5%" vspace="5%"/>

We are ready to train a deep learning model for object detection. Here we will use the Inception-ResNet-v2 model with pre-trained weights and train this to our data. We are already import necessary libraries from TensorFlow previously, lets continue.


In [None]:
inception_resnet = InceptionResNetV2(weights="imagenet",include_top=False, input_tensor=Input(shape=(224,224,3)))
# ---------------------
headmodel = inception_resnet.output
headmodel = Flatten()(headmodel)
headmodel = Dense(500,activation="relu")(headmodel)
headmodel = Dense(250,activation="relu")(headmodel)
headmodel = Dense(4,activation='sigmoid')(headmodel)

# ---------- model
model = Model(inputs=inception_resnet.input,outputs=headmodel)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5


Now compile the model and  have a look at our summary. Don't de surprise summary will be a bit massiv. The summary is textual and includes information about: The layers and their order in the model. The output shape of each layer. The number of parameters (weights) in each layer.

In [None]:
# Complie model
model.compile(loss='mse',optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 111, 111, 32)         864       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 111, 111, 32)         96        ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 activation (Activation)     (None, 111, 111, 32)         0         ['batch_normalization[0][0

<p id="part16"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">INCEPTION-RESNET-V2 TRAINING AND SAVE</span>

In [None]:
tfb = TensorBoard('object_detection')
history = model.fit(x=x_train,y=y_train,batch_size=10,epochs=100,
                    validation_data=(x_test,y_test),callbacks=[tfb])

In [None]:
model.save('./object_detection.h5')

<p id="part17"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">TENSORBOARD</span>

Lest have a look at on scalars on TensorBoard. In order to do it we will need to run simple command with right path for our "object detection". After we will see output with hosted link open it with Chrome. I was using VSCode for this project and for me it was way easy to run TensorBoard overview results, but in Kaggle it a bit more complicated and could  be disscused in other topic. For now i will show one screenshot of result which we have. We can see on scalars *Figure 12* how is our model preform. Our train and validation set don’t have over fitting behavior and our loss with respect of epochs is less.

You can simple type <code>!tensorboard --logdir="./object_detection"</code> it will generate link with text, click on link and here we go. <code>Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all TensorBoard 2.6.0 at http://localhost:6006/ (Press CTRL+C to quit)</code>

<img src= "https://github.com/Asikpalysik/Automatic-License-Plate-Detection/blob/main/Presentation/Notebook8.png?raw=true" width="80%" align="center" hspace="5%" vspace="5%"/>

<p id="part18"></p>

# <span style="font-family: Arials; font-size: 20px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">PIPELINE OBJECT DETECTION MODEL</span>

<p id="part19"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">MAKE PREDICTIONS</span>

This is the final step in object detection. In this step, we will put it all together and get the prediction for a given image. First, I would like to try with one of my test pictures of car. Let load our model.

In [14]:
# Load model
model = tf.keras.models.load_model('./object_detection.h5')
print('Model loaded Sucessfully')

Model loaded Sucessfully


Next is loading our TEST picture with right path to it. I loaded some more images for this purpose  only - folder __TEST__.

In [15]:
path = '/content/gdrive/MyDrive/Colab Notebooks/ML/single_plate/img_folder/S1.jpg'
image = cv2.imread(path) # PIL object
image = np.array(image,dtype=np.uint8) # 8 bit array (0,255)
image1 = load_img(path,target_size=(224,224))
image_arr_224 = img_to_array(image1)/255.0  # Convert into array and get the normalized output

# Size of the orginal image
h,w,d = image.shape
print('Height of the image =',h)
print('Width of the image =',w)


Height of the image = 2250
Width of the image = 4000


In [16]:
xml_path = '/content/gdrive/MyDrive/Colab Notebooks/ML/single_plate/xml_folder/S1.xml'

In [17]:
info = xet.parse(xml_path)
root = info.getroot()
member_object = root.find('object')
labels_info = member_object.find('bndbox')
xmin_gt = int(labels_info.find('xmin').text)
xmax_gt = int(labels_info.find('xmax').text)
ymin_gt = int(labels_info.find('ymin').text)
ymax_gt = int(labels_info.find('ymax').text)

In [18]:
bb_gt = [xmin_gt, xmax_gt, ymin_gt, ymax_gt]

In [19]:
bb_gt

[1198, 2679, 755, 1603]

Now we can have a look at our image *Figure 13*

In [20]:
fig = px.imshow(image)
fig.update_layout(width=700, height=500,  margin=dict(l=10, r=10, b=10, t=10), xaxis_title='Figure 13 - TEST Image')

Output hidden; open in https://colab.research.google.com to view.

So, let's look into the shape of my image.

In [21]:
image_arr_224.shape

(224, 224, 3)

But in order to pass this image of a model, we need to provide the data in the dynamic fourth dimension. And what one indicates is a number of images. So here we are just passing only one image.

In [22]:
test_arr = image_arr_224.reshape(1,224,224,3)
test_arr.shape

(1, 224, 224, 3)

<p id="part20"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">DE-NORMALIZE THE OUTPUT</span>

In [23]:
# Make predictions
coords = model.predict(test_arr)
coords



array([[0.29366836, 0.6700257 , 0.33562842, 0.7218307 ]], dtype=float32)

We have got the output from the model and output what we got is the normalized output. So, what we need to do is to convert back into our original form values, which actually we did in during the training process, in the training process, we have the original form values and convert that normalized one. So basically, we will de-normalize the values back.

In [24]:
# Denormalize the values
denorm = np.array([w,w,h,h])
coords = coords * denorm
coords

array([[1174.67343807, 2680.10282516,  755.16394526, 1624.11913276]])

<p id="part21"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">BOUNDING BOX</span>

Now we will draw bounding box on top of the image. I just want to provide the two diagonal points. Let's make use of these points and let's draw the rectangle box.

In [25]:
coords = coords.astype(np.int32)
coords

array([[1174, 2680,  755, 1624]], dtype=int32)

In [26]:
# Draw bounding on top the image
xmin_pred, xmax_pred, ymin_pred, ymax_pred = coords[0]
pt1 =(xmin_pred, ymin_pred)
pt2 =(xmax_pred, ymax_pred)
print(pt1, pt2)

(1174, 755) (2680, 1624)


In [27]:
bb_pred = [xmin_pred, xmax_pred, ymin_pred, ymax_pred]

In [28]:
bb_pred

[1174, 2680, 755, 1624]

In [29]:
cv2.rectangle(image,pt1,pt2,(0,255,0),3)
cv2.rectangle(image,(xmin_gt, ymin_gt,), (xmax_gt, ymax_gt), (255,0,0), 3)
fig = px.imshow(image)
fig.update_layout(width=700, height=500, margin=dict(l=10, r=10, b=10, t=10))

Output hidden; open in https://colab.research.google.com to view.

In [51]:
# Link code: https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc
def get_iou(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of intersection rectangle
    interArea = abs(max((xB - xA, 0)) * max((yB - yA), 0))
    if interArea == 0:
        return 0
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)

    # return the intersection over union value
    return iou

In [52]:
bb_gt, bb_pred

([1198, 2679, 755, 1603], [1174, 2680, 755, 1624])

In [53]:
get_iou(bb_gt, bb_pred)

0

### Calculate average IOU

In [48]:
all_pred = model.predict(x_test)
all_gt = y_test



In [50]:
all_iou = 0
for i in range(len(all_gt)):
  all_iou += get_iou(all_pred[i], all_gt[i])

ave_iou = all_iou / len(all_gt)
print(ave_iou)

0.11571173981671658


<p id="part23"></p>

# <span style="font-family: Arials; font-size: 20px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">OPTICAL CHARACTER RECOGNITION - OCR</span>
<hr style="height: 0.5px; border: 0; background-color: #000000">

<p id="part24"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">TESSERACT OCR</span>

Optical character recognition (OCR) software that is used to extract text from the image. Tesseract OCR have a python API and it is open source. Firstly, we will do installation of it. It pretty simple and depend on you OS. You can find manual and files to download for installation [here](https://guides.library.illinois.edu/c.php?g=347520&p=4121425).

<p id="part25"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">LIMITATIONS OF PYTESSERACT</span>

Tesseract works best when there is a clean segmentation of the foreground text from the background. In practice, it can be extremely challenging to guarantee these types of setups. There are a variety of reasons you might not get good quality output from Tesseract like if the image has noise on the background. The better the image quality (size, contrast, lightning) the better the recognition result. It requires a bit of preprocessing to improve the OCR results, images need to be scaled appropriately, have as much image contrast as possible, and the text must be horizontally aligned. Tesseract OCR is quite powerful but does have the following limitations.

__Tesseract limitations summed in the list.__
<ul>
  <li>The OCR is not as accurate as some commercial solutions available to us.</li>
  <li>Doesn't do well with images affected by artifacts including partial occlusion, distorted perspective, and complex background.</li>
  <li>It is not capable of recognizing handwriting.</li>
  <li>It may find gibberish and report this as OCR output.</li>
  <li>If a document contains languages outside of those given in the -l LANG arguments, results may be poor.</li>  
  <li>It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns, and may try to join text across columns.</li>
  <li>Poor quality scans may produce poor quality OCR.</li>
  <li>It does not expose information about what font family text belongs to.</li>
</ul>

<p id="part26"></p>

# <span style="font-family: Arials; font-size: 16px; font-style: normal; font-weight: bold; letter-spacing: 3px; text-align: center; color: #000000; line-height:1.0">EXTRACT NUMBER PLATE TEXT FROM IMAGE</span>

Firstly, we will load our image and convert to array. Crop our bounding box with coordinates of it. We will identify region of interest (ROI) and have look at our cropped image *Figure 15*.

In [54]:
img = np.array(cv2.imread(path))
xmin_pred,xmax_pred,ymin_pred,ymax_pred = coords[0]
roi = img[ymin_pred:ymax_pred,xmin_pred:xmax_pred]
fig = px.imshow(roi)
fig.update_layout(width=350, height=250, margin=dict(l=10, r=10, b=10, t=10),xaxis_title='Figure 15 Cropped image')

With use of tesseract, we will extract the text from the mage.


In [55]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.9/2.9 MB[0m [31m54.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-p

In [57]:
import easyocr
reader = easyocr.Reader(['en'])
result = reader.readtext(roi)
text = ""
for res in result:
    text += res[1]

print(text)

74-H1096.86
