In [7]:
"""
Modules - include all modules here
"""
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib as plt
import json
import os
import cv2
from pprint import pprint

In [28]:
"""
List of classes
"""

# DataElement Class
class DataElement:
    """
    Plain record describing one annotated plate image
    @author: Kevin Jang (kj460)

    The class-level names below only declare the fields and their defaults;
    the constructor overwrites every one of them on the instance.
    """
    label = None        # text of the plate (taken from the annotation)
    height = None       # image height stored for this plate
    width = None        # image width stored for this plate
    label_length = None # count of characters in the label
    img = None          # image data attached to this plate
    tags = []           # tags attached to this plate

    def __init__( self, label, height, width, label_length, img, tags ):
        """
        Stores the given values on the new instance, one attribute per argument
        @author: Kevin Jang (kj460)
        @params:
            label        - text of the plate
            height       - image height
            width        - image width
            label_length - number of characters in the label
            img          - image data (may be None and filled in later)
            tags         - tags of this data
        """
        self.label, self.height, self.width = label, height, width
        self.label_length, self.img, self.tags = label_length, img, tags

    def __str__( self ):
        """
        Builds a tab-indented, multi-line listing of every field of this element
        @author: Kevin Jang (kj460)
        """
        # adjacent literals are concatenated into one template, one field per line
        template = (
            "\t** Values:\n"
            "\t\t* label : {}\n"
            "\t\t* height : {}\n"
            "\t\t* width : {}\n"
            "\t\t* label_length = {}\n"
            "\t\t* img = {}\n"
            "\t\t* tags = {}\n"
        )
        fields = ( self.label, self.height, self.width, self.label_length, self.img, self.tags )
        return template.format( *fields )

    def __repr__(self):
        """
        Uses the same text as __str__ so containers of elements print readably
        @author: Kevin Jang (kj460)
        """
        return self.__str__()

# DataSet Class
class DataSet:
    """
    This object contains the list of DataElement for a single dataset
    @author: Kevin Jang (kj460)
    """
    data_path = None  # path of the directory that contains data files
    data = {}         # list of DataElement
    num_redundant = 0 # number of redundant data
    num_missing = 0   # number of missing data
    num_noisy = 0     # number of noisy data

    def __init__( self, data_path ):
        """
        Constructor for this class
        @author: Kevin Jang (kj460)
        """
        self.data_path = data_path
    
    def __str__( self ):
        """
        Returns the state of each DataElement in the data list
        @author: Kevin Jang (kj460)
        """
        string = ""
        for label, data_elem in self.data.items():
            string += str( data_elem )
        return string
    
    def valid_json( self, json_data ):
        """
        Returns True if a given json_data is valid else return False
        @author: Kevin Jang (kj460)
        """
        if 'description' not in json_data or 'size' not in json_data or 'height' not in json_data[ 'size'] or 'width' not in json_data[ 'size' ] or 'tags' not in json_data or len( json_data[ 'tags' ] ) == 0:
            self.num_missing += 1
            return False
        elif len( json_data[ 'description' ] ) != 8 or int( json_data[ 'size' ][ 'width' ] ) != 152 or int( json_data[ 'size' ][ 'height' ] ) != 34:
            self.num_noisy += 1
            return False
        elif json_data[ 'description' ] in self.data.keys():
            self.num_redundant += 1
            return False
        else:
            return True
    
    def build_dataset( self, num_files ):
        """
        Reads data files and create DataElement for each data file and include it in the data list
        @author: Kevin Jang (kj460)
        @params:
            num_files - number of data files to be read; set this to 'None' to read all the files
        """
        ann_path = self.data_path + '/ann'
        img_path = self.data_path + '/img'
        count = 0
        for file in os.listdir( ann_path ):
            if num_files != None and count >= num_files:
                # stop reading
                break
            # file validation
            if '.json' not in file:
                continue
            # process JSON file
            json_file = open( ann_path + '/' + file )
            json_data = json.load( json_file )
            # data validation
            if not self.valid_json( json_data ):
                continue
            # create a new DataElement
            data_elem = DataElement( json_data[ 'description' ],
                                   json_data[ 'size' ][ 'height' ],
                                   json_data[ 'size' ][ 'width' ],
                                   len( json_data[ 'description' ] ),
                                   None,
                                   json_data[ 'tags' ] )
            # process PNG file
            img_file = cv2.imread( img_path + '/' + ( file.split( '.json' )[ 0 ] ) + '.png' )
            img_file = cv2.cvtColor( img_file, cv2.COLOR_BGR2GRAY )
            img_file = cv2.resize( img_file, ( data_elem.width, data_elem.height ) )
            img_file = img_file.astype( np.float32 ) / 255
            # add a new DataElement to the list
            data_elem.img = img_file
            self.data[ data_elem.label ] = data_elem
            count += 1
            
# TrainTestDataSet Class
class TrainTestDataSet:
    """
    Pairs the DataSet used for training with the DataSet used for testing
    @author: Kevin Jang (kj460)
    """
    train_data_path = None # directory holding the training data files
    test_data_path = None  # directory holding the testing data files
    train_dataset = None   # DataSet built from train_data_path (None until built)
    test_dataset = None    # DataSet built from test_data_path (None until built)

    def __init__( self, train_data_path, test_data_path ):
        """
        Remembers both directories; nothing is read until build_train_test_dataset is called
        @author: Kevin Jang (kj460)
        @params:
            train_data_path - directory of the training data
            test_data_path  - directory of the testing data
        """
        self.train_data_path, self.test_data_path = train_data_path, test_data_path

    def __str__( self ):
        """
        Returns the training dataset followed by the testing dataset, each under its own banner
        @author: Kevin Jang (kj460)
        """
        pieces = [
            '*** Training Set ***\n',
            str( self.train_dataset ),
            '\n*** Testing Set ***\n',
            str( self.test_dataset ),
        ]
        return ''.join( pieces )

    def build_train_test_dataset( self ):
        """
        Creates and fully loads the training DataSet, then the testing DataSet
        @author: Kevin Jang (kj460)
        """
        # each DataSet is attached to this object before it is loaded; None = read every file
        training = DataSet( self.train_data_path )
        self.train_dataset = training
        training.build_dataset( None )

        testing = DataSet( self.test_data_path )
        self.test_dataset = testing
        testing.build_dataset( None )


In [29]:
# DataValidator Class
class DataValidator:
    '''
    Reports the cleanliness counters collected for a training/testing dataset pair
    @author: Kevin Jang (kj460)
    '''
    train_test_dataset = None

    def __init__( self, train_test_dataset ):
        """
        Keeps a reference to the dataset pair whose counters will be reported
        @author: Kevin Jang (kj460)
        @params:
            train_test_dataset - object exposing train_dataset and test_dataset
        """
        self.train_test_dataset = train_test_dataset

    def __str__( self ):
        """
        Returns a report with the redundant, missing and noisy counts of both datasets
        @author: Kevin Jang (kj460)
        """
        training = self.train_test_dataset.train_dataset
        testing = self.train_test_dataset.test_dataset
        # ( section heading, name of the counter attribute read from each dataset )
        sections = (
            ( '\t** Number of Redundant Data\n', 'num_redundant' ),
            ( '\t** Number of Missing Data\n', 'num_missing' ),
            ( '\t** Number of Noisy Data\n', 'num_noisy' ),
        )
        report = [ '*** DataSet Validation ***\n' ]
        for heading, counter in sections:
            report.append( heading )
            report.append( '\t\t* Training DataSet : {}\n'.format( getattr( training, counter ) ) )
            report.append( '\t\t* Testing DataSet : {}\n'.format( getattr( testing, counter ) ) )
        return ''.join( report )
        

In [30]:
# directories that contain the data files (relative paths, one per split)
train_data_path = 'data/ANPR_OCR__train'
test_data_path = 'data/ANPR_OCR__test'

# create the TrainTestDataSet and load both splits from disk;
# train_test_dataset is a notebook-level name that the cells below read
train_test_dataset = TrainTestDataSet( train_data_path, test_data_path )
train_test_dataset.build_train_test_dataset()

# print the dataset (left disabled; it prints every loaded element)
# print( str( train_test_dataset ) )

In [31]:
# check the cleanliness of the dataset: print the redundant / missing / noisy
# counts that were collected for the training and the testing split
data_validator = DataValidator( train_test_dataset )
print( str ( data_validator ) )

*** DataSet Validation ***
	** Number of Redundant Data
		* Training DataSet : 0
		* Testing DataSet : 0
	** Number of Missing Data
		* Training DataSet : 0
		* Testing DataSet : 0
	** Number of Noisy Data
		* Training DataSet : 0
		* Testing DataSet : 0

