# Data Exploration
After the images were augmented, the newly generated images were also labeled using labelImg. This notebook combines the new set with the modified set and counts the total number of labels, as well as the percentage of the total label count that each label subset represents.

In [1]:
# Importing Dependencies
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [2]:
# Resolve the dataset location: an `images` folder inside the current working directory
curr_dir = os.getcwd()
images_dir = os.path.join(
    curr_dir,
    'images',
)

In [3]:
# Creating function to parse XML data
def xml_reader(dir_path):
    """Collect object labels from every XML annotation file in a directory.

    Each file directly inside ``dir_path`` whose name ends with ``.xml`` is
    parsed, and the text of every ``<name>`` element that is a direct child
    of a top-level ``<object>`` element is recorded.

    Parameters
    ----------
    dir_path : str
        Directory containing the XML annotation files.

    Returns
    -------
    labels : list
        Text of each matching ``<name>`` element, ordered by sorted file
        name and then by document order within each file.
    files : list of str
        ``dir_path`` joined with each parsed file name, sorted by file name.

    Raises
    ------
    OSError
        If ``dir_path`` cannot be listed or a file cannot be opened.
    xml.etree.ElementTree.ParseError
        If a ``.xml`` file is not well-formed.
    """
    labels = []
    files = []
    for file_name in sorted(os.listdir(dir_path)):
        if not file_name.endswith('.xml'):
            continue
        # Build the path from `dir_path` (not the notebook-level `images_dir`)
        # so the files opened are the ones that were just listed.
        xml_file = os.path.join(dir_path, file_name)
        files.append(xml_file)
        xroot = ET.parse(xml_file).getroot()
        # findall() with a bare tag name matches direct children only.
        for obj in xroot.findall('object'):
            labels.extend(name.text for name in obj.findall('name'))
    return labels, files

In [4]:
# Parse every XML annotation in `images_dir`: `labels` holds one entry per
# labeled object, `files` holds the path of each XML file that was read.
labels, files = xml_reader(images_dir)

In [5]:
# Tally how many times each expected label class occurs in `labels`
labels_dict = {
    class_name: labels.count(class_name)
    for class_name in (
        'trafficlight',
        'stop',
        'speedlimit',
        'crosswalk',
        'nostop',
        'yield',
    )
}

In [6]:
# Tabulate the per-class counts: one row per label, in dictionary order
labels_df = pd.DataFrame({
    'label': list(labels_dict.keys()),
    'count': list(labels_dict.values()),
})

In [7]:
# Display the per-class label counts
labels_df

Unnamed: 0,label,count
0,trafficlight,176
1,stop,93
2,speedlimit,862
3,crosswalk,298
4,nostop,107
5,yield,111


In [8]:
# Express each class count as a percentage of all counted labels, rounded to two decimals
labels_df['subset size (%)'] = (
    labels_df['count']
    .div(labels_df['count'].sum())
    .mul(100)
    .round(2)
)

In [9]:
# Display the table again, now including the `subset size (%)` column
labels_df

Unnamed: 0,label,count,subset size (%)
0,trafficlight,176,10.69
1,stop,93,5.65
2,speedlimit,862,52.34
3,crosswalk,298,18.09
4,nostop,107,6.5
5,yield,111,6.74


In [10]:
# Report the sum of the `count` column, i.e. the total number of tallied labels
print("Total label count: {}".format(labels_df['count'].sum()))

Total label count: 1647


In [11]:
# Report how many XML annotation files were parsed by `xml_reader`
print("Total number of files: {}".format(len(files)))

Total number of files: 977
