# Data Preprocessing

## Imports

#### Import Libraries

In [18]:
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
import os 
from glob import glob
import xml.etree.ElementTree as xet

#### Data Loading

$\text{Let's collect all the XML files so we can read the bounding boxes}$

In [4]:
# Show the kernel's working directory; the relative '..' paths used in the cells below resolve against it.
os.getcwd()

'c:\\Users\\kames\\OneDrive\\Desktop\\cars-number-plate-recognition-project\\notebooks'

In [5]:
# Build the glob pattern for the annotation files; os.path.join inserts the platform's path separator.
# NOTE(review): '*xml' matches any file name ending in "xml", not only the ".xml" extension.
os.path.join('..', 'data', 'raw', 'images', '*xml')

'..\\data\\raw\\images\\*xml'

In [6]:
# List every path that matches the annotation pattern.
glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))

['..\\data\\raw\\images\\N1.xml',
 '..\\data\\raw\\images\\N100.xml',
 '..\\data\\raw\\images\\N101.xml',
 '..\\data\\raw\\images\\N102.xml',
 '..\\data\\raw\\images\\N103.xml',
 '..\\data\\raw\\images\\N104.xml',
 '..\\data\\raw\\images\\N105.xml',
 '..\\data\\raw\\images\\N106.xml',
 '..\\data\\raw\\images\\N107.xml',
 '..\\data\\raw\\images\\N108.xml',
 '..\\data\\raw\\images\\N109.xml',
 '..\\data\\raw\\images\\N11.xml',
 '..\\data\\raw\\images\\N110.xml',
 '..\\data\\raw\\images\\N111.xml',
 '..\\data\\raw\\images\\N112.xml',
 '..\\data\\raw\\images\\N113.xml',
 '..\\data\\raw\\images\\N114.xml',
 '..\\data\\raw\\images\\N116.xml',
 '..\\data\\raw\\images\\N117.xml',
 '..\\data\\raw\\images\\N118.xml',
 '..\\data\\raw\\images\\N119.xml',
 '..\\data\\raw\\images\\N12.xml',
 '..\\data\\raw\\images\\N120.xml',
 '..\\data\\raw\\images\\N121.xml',
 '..\\data\\raw\\images\\N122.xml',
 '..\\data\\raw\\images\\N123.xml',
 '..\\data\\raw\\images\\N124.xml',
 '..\\data\\raw\\images\\N126.xm

In [7]:
# Take the first match; the cells below use this same expression as their sample annotation file.
glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0]

'..\\data\\raw\\images\\N1.xml'

In [8]:
# Parse the first annotation file into an ElementTree object.
xet.parse(
    glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0]
)

<xml.etree.ElementTree.ElementTree at 0x20031426b90>

In [9]:
# Parse the first annotation file and take the root element of its tree.
(
    xet.parse(glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0])
    .getroot()
)

<Element 'annotation' at 0x00000200315109F0>

In [10]:
# Walk root -> first <object> -> first <bndbox> and read the text of its <xmin> tag.
(
    xet.parse(glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0])
    .getroot()
    .find('object')
    .find('bndbox')
    .find('xmin')
    .text
)

'1093'

In [11]:
# Walk root -> first <object> -> first <bndbox> and read the text of its <xmax> tag.
(
    xet.parse(glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0])
    .getroot()
    .find('object')
    .find('bndbox')
    .find('xmax')
    .text
)

'1396'

In [12]:
# Walk root -> first <object> -> first <bndbox> and read the text of its <ymin> tag.
(
    xet.parse(glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0])
    .getroot()
    .find('object')
    .find('bndbox')
    .find('ymin')
    .text
)

'645'

In [13]:
# Walk root -> first <object> -> first <bndbox> and read the text of its <ymax> tag.
(
    xet.parse(glob(os.path.join('..', 'data', 'raw', 'images', '*xml'))[0])
    .getroot()
    .find('object')
    .find('bndbox')
    .find('ymax')
    .text
)

'727'

#### Extract Bounding Boxes from the XML Files

In [14]:
# Collect every annotation file.
# - The pattern is '*.xml' (with the dot) so only files with the .xml extension match;
#   the bare '*xml' would also pick up any name that merely ends in "xml".
# - glob() returns paths in arbitrary order, so the result is sorted to keep the
#   row order reproducible from run to run.
xml_paths = sorted(glob(os.path.join('..', 'data', 'raw', 'images', '*.xml')))

# Column-oriented accumulator: one list per field, all starting empty.
label_dict = dict(file_path=[], xmin=[], xmax=[], ymin=[], ymax=[])
label_dict

{'file_path': [], 'xmin': [], 'xmax': [], 'ymin': [], 'ymax': []}

In [15]:
BBOX_TAGS = ('xmin', 'xmax', 'ymin', 'ymax')


def parse_bounding_box(xml_path):
    """Read the bounding box stored in one XML annotation file.

    Only the first ``<object>`` element and its first ``<bndbox>`` child are
    read; any further objects in the file are ignored.

    Parameters
    ----------
    xml_path : str
        Path of the XML annotation file.

    Returns
    -------
    tuple of int
        ``(xmin, xmax, ymin, ymax)``.

    Raises
    ------
    ValueError
        If the file has no ``<object>``/``<bndbox>`` element, or if one of the
        four coordinate tags is missing or does not hold an integer.
    """
    root = xet.parse(xml_path).getroot()
    object_tag = root.find('object')
    # find() returns None when the tag is absent, so check before chaining;
    # otherwise the failure is an AttributeError that does not name the file.
    bndbox_tag = object_tag.find('bndbox') if object_tag is not None else None
    if bndbox_tag is None:
        raise ValueError('{}: no <object>/<bndbox> element found'.format(xml_path))

    coordinates = []
    for tag in BBOX_TAGS:
        text = bndbox_tag.findtext(tag)
        if text is None:
            raise ValueError('{}: <bndbox> has no <{}> tag'.format(xml_path, tag))
        coordinates.append(int(text))
    return tuple(coordinates)


# Start from empty lists on every run so that re-executing this cell does not
# append a second copy of every row.
label_dict = dict(file_path=[], xmin=[], xmax=[], ymin=[], ymax=[])

for xml_path in xml_paths:
    # Parse first, append afterwards: a bad file never leaves a half-filled row.
    bounding_box = parse_bounding_box(xml_path)
    label_dict['file_path'].append(xml_path)
    for tag, value in zip(BBOX_TAGS, bounding_box):
        label_dict[tag].append(value)

In [16]:
# Inspect the filled accumulator; each list holds one entry per annotation file.
# NOTE(review): this displays every entry; a slice or the DataFrame's head() gives a shorter preview.
label_dict

{'file_path': ['..\\data\\raw\\images\\N1.xml',
  '..\\data\\raw\\images\\N100.xml',
  '..\\data\\raw\\images\\N101.xml',
  '..\\data\\raw\\images\\N102.xml',
  '..\\data\\raw\\images\\N103.xml',
  '..\\data\\raw\\images\\N104.xml',
  '..\\data\\raw\\images\\N105.xml',
  '..\\data\\raw\\images\\N106.xml',
  '..\\data\\raw\\images\\N107.xml',
  '..\\data\\raw\\images\\N108.xml',
  '..\\data\\raw\\images\\N109.xml',
  '..\\data\\raw\\images\\N11.xml',
  '..\\data\\raw\\images\\N110.xml',
  '..\\data\\raw\\images\\N111.xml',
  '..\\data\\raw\\images\\N112.xml',
  '..\\data\\raw\\images\\N113.xml',
  '..\\data\\raw\\images\\N114.xml',
  '..\\data\\raw\\images\\N116.xml',
  '..\\data\\raw\\images\\N117.xml',
  '..\\data\\raw\\images\\N118.xml',
  '..\\data\\raw\\images\\N119.xml',
  '..\\data\\raw\\images\\N12.xml',
  '..\\data\\raw\\images\\N120.xml',
  '..\\data\\raw\\images\\N121.xml',
  '..\\data\\raw\\images\\N122.xml',
  '..\\data\\raw\\images\\N123.xml',
  '..\\data\\raw\\images\\N12

#### Convert Label_Dict to Dataframe

In [17]:
# Turn the column-oriented dict into a table: each key becomes a column and
# each list supplies that column's values.
df = pd.DataFrame.from_dict(label_dict)

# Preview the first five rows.
df.head()

Unnamed: 0,file_path,xmin,xmax,ymin,ymax
0,..\data\raw\images\N1.xml,1093,1396,645,727
1,..\data\raw\images\N100.xml,134,301,312,350
2,..\data\raw\images\N101.xml,31,139,128,161
3,..\data\raw\images\N102.xml,164,316,216,243
4,..\data\raw\images\N103.xml,813,1067,665,724


---