# Data Cleaning

In [20]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [21]:
# load all xml files and store in a list
xml_list = glob('./data_images/*.xml')
# replace \\ with /
xml_list = list(map(lambda x:  x.replace('\\', '/'), xml_list))

In [22]:
print(xml_list)

['./data_images/00001.xml', './data_images/00002.xml', './data_images/00003.xml', './data_images/00004.xml', './data_images/00005.xml', './data_images/00006.xml', './data_images/00007.xml', './data_images/00008.xml', './data_images/00009.xml', './data_images/00010.xml', './data_images/00011.xml', './data_images/00012.xml', './data_images/00013.xml', './data_images/00014.xml', './data_images/00015.xml', './data_images/00016.xml', './data_images/00017.xml', './data_images/00018.xml', './data_images/00019.xml', './data_images/00020.xml', './data_images/00021.xml', './data_images/00022.xml', './data_images/00023.xml', './data_images/00024.xml', './data_images/00025.xml', './data_images/00026.xml', './data_images/00027.xml', './data_images/00028.xml', './data_images/00029.xml', './data_images/00030.xml', './data_images/00031.xml', './data_images/00032.xml', './data_images/00033.xml', './data_images/00034.xml', './data_images/00035.xml', './data_images/00036.xml', './data_images/00037.xml', 

In [23]:
# step-2 read xml file
# from each xml file, we need to extract 
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract file name 
    image_name = root.find('filename').text
    # width and height of the image 
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser

In [24]:
parser_all = list(map(extract_text, xml_list))

In [25]:
data = reduce(lambda x, y : x+y, parser_all)

In [26]:
data

[['00001.jpg', '1300', '957', 'With Seatbelt', '248', '750', '130', '811'],
 ['00002.jpg', '800', '534', 'With Seatbelt', '162', '425', '57', '499'],
 ['00003.jpg', '500', '334', 'With Seatbelt', '54', '419', '11', '333'],
 ['00004.jpg', '626', '417', 'With Seatbelt', '79', '357', '45', '382'],
 ['00005.jpg', '626', '417', 'With Seatbelt', '42', '316', '44', '417'],
 ['00006.jpg', '626', '418', 'With Seatbelt', '97', '480', '27', '353'],
 ['00007.jpg', '626', '417', 'With Seatbelt', '269', '626', '71', '417'],
 ['00008.jpg', '626', '417', 'With Seatbelt', '12', '362', '40', '353'],
 ['00009.jpg', '1300', '957', 'With Seatbelt', '216', '944', '118', '820'],
 ['00010.jpg', '626', '417', 'With Seatbelt', '109', '364', '39', '417'],
 ['00011.jpg', '626', '418', 'With Seatbelt', '141', '522', '35', '365'],
 ['00012.jpg', '626', '417', 'With Seatbelt', '198', '626', '67', '348'],
 ['00013.jpg', '626', '418', 'With Seatbelt', '77', '308', '102', '418'],
 ['00014.jpg', '1500', '1101', 'With Se

In [27]:
len(data)

891

In [28]:
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax' ])

In [29]:
df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,00001.jpg,1300,957,With Seatbelt,248,750,130,811
1,00002.jpg,800,534,With Seatbelt,162,425,57,499
2,00003.jpg,500,334,With Seatbelt,54,419,11,333
3,00004.jpg,626,417,With Seatbelt,79,357,45,382
4,00005.jpg,626,417,With Seatbelt,42,316,44,417
...,...,...,...,...,...,...,...,...
886,00758.jpg,626,417,Without Seatbelt,298,591,20,396
887,00759.jpg,852,480,Without Seatbelt,287,635,14,440
888,00760.jpg,1300,956,Without Seatbelt,291,835,83,865
889,00761.jpg,800,534,Without Seatbelt,138,518,35,479


In [30]:
df.shape

(891, 8)

In [31]:
df['name'].value_counts()

name
With Seatbelt       504
Without Seatbelt    387
Name: count, dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  891 non-null    object
 1   width     891 non-null    object
 2   height    891 non-null    object
 3   name      891 non-null    object
 4   xmin      891 non-null    object
 5   xmax      891 non-null    object
 6   ymin      891 non-null    object
 7   ymax      891 non-null    object
dtypes: object(8)
memory usage: 55.8+ KB


In [33]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax' ]
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  891 non-null    object
 1   width     891 non-null    int32 
 2   height    891 non-null    int32 
 3   name      891 non-null    object
 4   xmin      891 non-null    int32 
 5   xmax      891 non-null    int32 
 6   ymin      891 non-null    int32 
 7   ymax      891 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 34.9+ KB


In [34]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [35]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,00001.jpg,1300,957,With Seatbelt,248,750,130,811,0.383846,0.491641,0.386154,0.711599
1,00002.jpg,800,534,With Seatbelt,162,425,57,499,0.366875,0.520599,0.32875,0.827715
2,00003.jpg,500,334,With Seatbelt,54,419,11,333,0.473,0.51497,0.73,0.964072
3,00004.jpg,626,417,With Seatbelt,79,357,45,382,0.348243,0.51199,0.444089,0.808153
4,00005.jpg,626,417,With Seatbelt,42,316,44,417,0.285942,0.552758,0.4377,0.894484


# Splitting data into Train & Test

In [36]:
images = df['filename'].unique()

In [37]:
len(images)

762

In [38]:
# 80% train
# 20% test
img_df = pd.DataFrame(images, columns = ['filename'])
img_train = tuple(img_df.sample(frac = 0.8)['filename']) # shuffle and pick 80% of images

In [39]:
print(img_train)

('00287.jpg', '00151.jpg', '00567.jpg', '00489.jpg', '00586.jpg', '00660.jpg', '00462.jpg', '00558.jpg', '00686.jpg', '00293.jpg', '00511.jpg', '00263.jpg', '00061.jpg', '00248.jpg', '00262.jpg', '00063.jpg', '00716.jpg', '00387.jpg', '00359.jpg', '00085.jpg', '00422.jpg', '00598.jpg', '00476.jpg', '00230.jpg', '00695.jpg', '00338.jpg', '00384.jpg', '00342.jpg', '00218.jpg', '00642.jpg', '00500.jpg', '00280.jpg', '00673.jpg', '00312.jpg', '00303.jpg', '00412.jpg', '00395.jpg', '00326.jpg', '00705.jpg', '00641.jpg', '00547.jpg', '00393.jpg', '00759.jpg', '00614.jpg', '00499.jpg', '00314.jpg', '00559.jpg', '00402.jpg', '00255.jpg', '00665.jpg', '00208.jpg', '00036.jpg', '00119.jpg', '00613.jpg', '00339.jpg', '00096.jpg', '00631.jpg', '00590.jpg', '00570.jpg', '00086.jpg', '00136.jpg', '00350.jpg', '00298.jpg', '00729.jpg', '00040.jpg', '00272.jpg', '00678.jpg', '00174.jpg', '00185.jpg', '00301.jpg', '00200.jpg', '00666.jpg', '00633.jpg', '00077.jpg', '00715.jpg', '00039.jpg', '00226.jpg'

In [40]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # shuffle and pick 20% of images

In [41]:
print(img_test)

('00002.jpg', '00004.jpg', '00005.jpg', '00009.jpg', '00015.jpg', '00018.jpg', '00019.jpg', '00024.jpg', '00027.jpg', '00031.jpg', '00034.jpg', '00037.jpg', '00041.jpg', '00049.jpg', '00056.jpg', '00058.jpg', '00064.jpg', '00068.jpg', '00078.jpg', '00079.jpg', '00080.jpg', '00083.jpg', '00084.jpg', '00092.jpg', '00106.jpg', '00108.jpg', '00112.jpg', '00115.jpg', '00118.jpg', '00126.jpg', '00127.jpg', '00139.jpg', '00141.jpg', '00142.jpg', '00143.jpg', '00145.jpg', '00146.jpg', '00153.jpg', '00155.jpg', '00164.jpg', '00170.jpg', '00181.jpg', '00190.jpg', '00203.jpg', '00204.jpg', '00212.jpg', '00220.jpg', '00227.jpg', '00233.jpg', '00235.jpg', '00236.jpg', '00240.jpg', '00243.jpg', '00246.jpg', '00253.jpg', '00257.jpg', '00259.jpg', '00264.jpg', '00271.jpg', '00283.jpg', '00286.jpg', '00291.jpg', '00311.jpg', '00313.jpg', '00317.jpg', '00318.jpg', '00322.jpg', '00328.jpg', '00343.jpg', '00347.jpg', '00348.jpg', '00358.jpg', '00361.jpg', '00363.jpg', '00364.jpg', '00366.jpg', '00372.jpg'

In [42]:
print(len(img_train))
print(len(img_test))

610
152


In [43]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [44]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,00001.jpg,1300,957,With Seatbelt,248,750,130,811,0.383846,0.491641,0.386154,0.711599
2,00003.jpg,500,334,With Seatbelt,54,419,11,333,0.473,0.51497,0.73,0.964072
5,00006.jpg,626,418,With Seatbelt,97,480,27,353,0.460863,0.454545,0.611821,0.779904
6,00007.jpg,626,417,With Seatbelt,269,626,71,417,0.714856,0.585132,0.570288,0.829736
7,00008.jpg,626,417,With Seatbelt,12,362,40,353,0.298722,0.471223,0.559105,0.7506


In [45]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,00002.jpg,800,534,With Seatbelt,162,425,57,499,0.366875,0.520599,0.32875,0.827715
3,00004.jpg,626,417,With Seatbelt,79,357,45,382,0.348243,0.51199,0.444089,0.808153
4,00005.jpg,626,417,With Seatbelt,42,316,44,417,0.285942,0.552758,0.4377,0.894484
8,00009.jpg,1300,957,With Seatbelt,216,944,118,820,0.446154,0.490073,0.56,0.733542
14,00015.jpg,1300,953,With Seatbelt,467,1222,41,844,0.649615,0.464323,0.580769,0.842602


# Label Encoding - Assigning numbers to object names

In [48]:
def label_encoding(x):
    labels = {'With Seatbelt':0, 'Without Seatbelt':1}
    return labels[x]

In [49]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [50]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,00001.jpg,1300,957,With Seatbelt,248,750,130,811,0.383846,0.491641,0.386154,0.711599,0
2,00003.jpg,500,334,With Seatbelt,54,419,11,333,0.473,0.51497,0.73,0.964072,0
5,00006.jpg,626,418,With Seatbelt,97,480,27,353,0.460863,0.454545,0.611821,0.779904,0
6,00007.jpg,626,417,With Seatbelt,269,626,71,417,0.714856,0.585132,0.570288,0.829736,0
7,00008.jpg,626,417,With Seatbelt,12,362,40,353,0.298722,0.471223,0.559105,0.7506,0
9,00010.jpg,626,417,With Seatbelt,109,364,39,417,0.377796,0.546763,0.407348,0.906475,0
10,00011.jpg,626,418,With Seatbelt,141,522,35,365,0.529553,0.478469,0.608626,0.789474,0
11,00012.jpg,626,417,With Seatbelt,198,626,67,348,0.658147,0.497602,0.683706,0.673861,0
12,00013.jpg,626,418,With Seatbelt,77,308,102,418,0.307508,0.62201,0.36901,0.755981,0
13,00014.jpg,1500,1101,With Seatbelt,28,973,100,995,0.333667,0.497275,0.63,0.812897,0


In [51]:
test_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,00002.jpg,800,534,With Seatbelt,162,425,57,499,0.366875,0.520599,0.32875,0.827715,0
3,00004.jpg,626,417,With Seatbelt,79,357,45,382,0.348243,0.51199,0.444089,0.808153,0
4,00005.jpg,626,417,With Seatbelt,42,316,44,417,0.285942,0.552758,0.4377,0.894484,0
8,00009.jpg,1300,957,With Seatbelt,216,944,118,820,0.446154,0.490073,0.56,0.733542,0
14,00015.jpg,1300,953,With Seatbelt,467,1222,41,844,0.649615,0.464323,0.580769,0.842602,0
17,00018.jpg,1500,1246,With Seatbelt,4,1132,117,1121,0.378667,0.49679,0.752,0.805778,0
18,00019.jpg,626,434,With Seatbelt,206,626,17,385,0.664537,0.463134,0.670927,0.847926,0
24,00024.jpg,1300,956,With Seatbelt,5,738,99,763,0.285769,0.450837,0.563846,0.694561,0
27,00027.jpg,1300,956,With Seatbelt,524,779,49,715,0.501154,0.399582,0.196154,0.696653,0
28,00027.jpg,1300,956,With Seatbelt,654,1277,1,866,0.742692,0.453452,0.479231,0.904812,0


# Save Images and Labels in text form in their respective folders

In [52]:
import os
from shutil import move

In [53]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [54]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [55]:
def save_data(filename, folder_path, group_obj):
    # move images
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move img to the destination folder

    # move labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep = ' ', index = False, header = False)

In [56]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [57]:
filename_series.apply(save_data, args = (train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
605    None
606    None
607    None
608    None
609    None
Length: 610, dtype: object

In [58]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [59]:
filename_series_test.apply(save_data, args = (test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
147    None
148    None
149    None
150    None
151    None
Length: 152, dtype: object