In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
# step-1: get path of each xml file
xmlfiles = glob('./data_images/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [3]:
xmlfiles

['./data_images/20230915_211711.xml',
 './data_images/20230915_211714.xml',
 './data_images/20230915_211717.xml',
 './data_images/20230915_211728.xml',
 './data_images/20230915_211731.xml',
 './data_images/20230915_211733.xml',
 './data_images/20230915_211736.xml',
 './data_images/20230915_211740.xml',
 './data_images/20230915_211743.xml',
 './data_images/20230915_211748.xml',
 './data_images/20230915_211802.xml',
 './data_images/20230915_211805.xml',
 './data_images/20230915_211809.xml',
 './data_images/20230915_211812.xml',
 './data_images/20230915_211816.xml',
 './data_images/20230915_211821.xml',
 './data_images/20230915_211824.xml',
 './data_images/20230915_211827.xml',
 './data_images/20230915_211832.xml',
 './data_images/20230915_211835.xml',
 './data_images/20230915_211840.xml',
 './data_images/20230915_211848.xml',
 './data_images/20230915_211851.xml',
 './data_images/20230915_211853.xml',
 './data_images/20230915_211854.xml',
 './data_images/20230915_211856.xml',
 './data_ima

In [4]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:    
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
        return parser

In [5]:
parser_all = list(map(extract_text,xmlfiles))

In [6]:
data = reduce(lambda x, y : x+y,parser_all)

In [7]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [8]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,20230915_211711.jpg,441,600,Anchor Full Cream Milk Powder - 400.00 g,62,424,76,523
1,20230915_211714.jpg,395,600,Anchor Full Cream Milk Powder - 400.00 g,74,364,49,554
2,20230915_211717.jpg,461,600,Anchor Full Cream Milk Powder - 400.00 g,77,396,70,528
3,20230915_211728.jpg,492,600,Anchor Full Cream Milk Powder - 400.00 g,82,459,82,518
4,20230915_211731.jpg,422,600,Anchor Full Cream Milk Powder - 400.00 g,58,384,59,531


In [9]:
df.shape

(4199, 8)

In [10]:
df['name'].value_counts()

name
Anchor Full Cream Milk Powder - 400.00 g           1304
Baby Cheramy Floral Moisturising Soap - 90.00 g    1244
Maliban Chick Bits - 80.00 g                        911
Prima Kottu Mee Hot & Spicy Noodles - 80.00 g       740
Name: count, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4199 entries, 0 to 4198
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4199 non-null   object
 1   width     4199 non-null   object
 2   height    4199 non-null   object
 3   name      4199 non-null   object
 4   xmin      4199 non-null   object
 5   xmax      4199 non-null   object
 6   ymin      4199 non-null   object
 7   ymax      4199 non-null   object
dtypes: object(8)
memory usage: 262.6+ KB


In [12]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4199 entries, 0 to 4198
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4199 non-null   object
 1   width     4199 non-null   int32 
 2   height    4199 non-null   int32 
 3   name      4199 non-null   object
 4   xmin      4199 non-null   int32 
 5   xmax      4199 non-null   int32 
 6   ymin      4199 non-null   int32 
 7   ymax      4199 non-null   int32 
dtypes: int32(6), object(2)
memory usage: 164.1+ KB


In [13]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [14]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,20230915_211711.jpg,441,600,Anchor Full Cream Milk Powder - 400.00 g,62,424,76,523,0.55102,0.499167,0.820862,0.745
1,20230915_211714.jpg,395,600,Anchor Full Cream Milk Powder - 400.00 g,74,364,49,554,0.55443,0.5025,0.734177,0.841667
2,20230915_211717.jpg,461,600,Anchor Full Cream Milk Powder - 400.00 g,77,396,70,528,0.513015,0.498333,0.691974,0.763333
3,20230915_211728.jpg,492,600,Anchor Full Cream Milk Powder - 400.00 g,82,459,82,518,0.549797,0.5,0.76626,0.726667
4,20230915_211731.jpg,422,600,Anchor Full Cream Milk Powder - 400.00 g,58,384,59,531,0.523697,0.491667,0.772512,0.786667


In [15]:
images = df['filename'].unique()

In [16]:
len(images)

4199

In [17]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [18]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [19]:
len(img_train), len(img_test)

(3359, 840)

In [20]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [21]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
2,20230915_211717.jpg,461,600,Anchor Full Cream Milk Powder - 400.00 g,77,396,70,528,0.513015,0.498333,0.691974,0.763333
3,20230915_211728.jpg,492,600,Anchor Full Cream Milk Powder - 400.00 g,82,459,82,518,0.549797,0.5,0.76626,0.726667
4,20230915_211731.jpg,422,600,Anchor Full Cream Milk Powder - 400.00 g,58,384,59,531,0.523697,0.491667,0.772512,0.786667
5,20230915_211733.jpg,426,600,Anchor Full Cream Milk Powder - 400.00 g,77,381,76,526,0.537559,0.501667,0.713615,0.75
7,20230915_211740.jpg,528,600,Anchor Full Cream Milk Powder - 400.00 g,122,437,85,531,0.529356,0.513333,0.596591,0.743333


In [22]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,20230915_211711.jpg,441,600,Anchor Full Cream Milk Powder - 400.00 g,62,424,76,523,0.55102,0.499167,0.820862,0.745
1,20230915_211714.jpg,395,600,Anchor Full Cream Milk Powder - 400.00 g,74,364,49,554,0.55443,0.5025,0.734177,0.841667
6,20230915_211736.jpg,560,600,Anchor Full Cream Milk Powder - 400.00 g,65,455,70,505,0.464286,0.479167,0.696429,0.725
12,20230915_211809.jpg,474,600,Anchor Full Cream Milk Powder - 400.00 g,127,384,80,534,0.53903,0.511667,0.542194,0.756667
17,20230915_211827.jpg,549,600,Anchor Full Cream Milk Powder - 400.00 g,82,532,100,519,0.559199,0.515833,0.819672,0.698333


In [23]:
# Label encoding
def label_encoding(x):
    labels = {'Anchor Full Cream Milk Powder - 400.00 g':0, 'Baby Cheramy Floral Moisturising Soap - 90.00 g':1,
              'Prima Kottu Mee Hot & Spicy Noodles - 80.00 g':2, 'Maliban Chick Bits - 80.00 g':3}
    return labels[x]

In [25]:
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)

In [26]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
2,20230915_211717.jpg,461,600,Anchor Full Cream Milk Powder - 400.00 g,77,396,70,528,0.513015,0.498333,0.691974,0.763333,0
3,20230915_211728.jpg,492,600,Anchor Full Cream Milk Powder - 400.00 g,82,459,82,518,0.549797,0.5,0.76626,0.726667,0
4,20230915_211731.jpg,422,600,Anchor Full Cream Milk Powder - 400.00 g,58,384,59,531,0.523697,0.491667,0.772512,0.786667,0
5,20230915_211733.jpg,426,600,Anchor Full Cream Milk Powder - 400.00 g,77,381,76,526,0.537559,0.501667,0.713615,0.75,0
7,20230915_211740.jpg,528,600,Anchor Full Cream Milk Powder - 400.00 g,122,437,85,531,0.529356,0.513333,0.596591,0.743333,0
8,20230915_211743.jpg,435,600,Anchor Full Cream Milk Powder - 400.00 g,92,357,75,514,0.516092,0.490833,0.609195,0.731667,0
9,20230915_211748.jpg,382,600,Anchor Full Cream Milk Powder - 400.00 g,86,353,57,529,0.574607,0.488333,0.698953,0.786667,0
10,20230915_211802.jpg,430,600,Anchor Full Cream Milk Powder - 400.00 g,40,401,88,527,0.512791,0.5125,0.839535,0.731667,0
11,20230915_211805.jpg,509,600,Anchor Full Cream Milk Powder - 400.00 g,53,488,93,518,0.531434,0.509167,0.854617,0.708333,0
13,20230915_211812.jpg,400,600,Anchor Full Cream Milk Powder - 400.00 g,132,284,64,544,0.52,0.506667,0.38,0.8,0


In [27]:
import os
from shutil import move

In [28]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [29]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [30]:
groupby_obj_train.get_group('20230915_211717.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)

In [31]:
#groupby_obj_train.get_group('img(1).jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder

    # save the labels
    text_filename =os.path.join(folder_path,
                                os.path.splitext(filename)[0]+'.txt')
    groupby_obj_train.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [32]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [33]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
3354    None
3355    None
3356    None
3357    None
3358    None
Length: 3359, dtype: object

In [34]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [35]:
for filename in filename_series_test:
    try:
        group = groupby_obj_test.get_group(filename)
        text_filename = os.path.join(test_folder, os.path.splitext(filename)[0] + '.txt')
        group.set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)
    except KeyError:
        print(f"Warning: File '{filename}' not found in test group.")