In [1]:
import os 
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [5]:
# Load all xml files and store in a list
xmlfiles = glob('./data_images/*.xml')

# Data cleaning. Replace \\ with /
replace_text = lambda x: x.replace('\\', '/')
xmlfiles = list(map(replace_text, xmlfiles))

In [6]:
xmlfiles

['./data_images/2007_000027.xml',
 './data_images/2007_000032.xml',
 './data_images/2007_000033.xml',
 './data_images/2007_000039.xml',
 './data_images/2007_000042.xml',
 './data_images/2007_000061.xml',
 './data_images/2007_000063.xml',
 './data_images/2007_000068.xml',
 './data_images/2007_000121.xml',
 './data_images/2007_000123.xml',
 './data_images/2007_000129.xml',
 './data_images/2007_000170.xml',
 './data_images/2007_000175.xml',
 './data_images/2007_000187.xml',
 './data_images/2007_000241.xml',
 './data_images/2007_000243.xml',
 './data_images/2007_000250.xml',
 './data_images/2007_000256.xml',
 './data_images/2007_000272.xml',
 './data_images/2007_000323.xml',
 './data_images/2007_000332.xml',
 './data_images/2007_000333.xml',
 './data_images/2007_000346.xml',
 './data_images/2007_000363.xml',
 './data_images/2007_000364.xml',
 './data_images/2007_000392.xml',
 './data_images/2007_000423.xml',
 './data_images/2007_000452.xml',
 './data_images/2007_000464.xml',
 './data_image

In [20]:
# Step 2: Read xml files
# From each xml file we need to extract
# Filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # Extract filename
    image_name = root.find('filename').text
    
    # Width and Height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser

In [21]:
parser_all = list(map(extract_text, xmlfiles))

In [23]:
data = reduce(lambda x, y : x + y, parser_all)

In [24]:
data

[['2007_000027.jpg', '486', '500', 'person', '174', '349', '101', '351'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '104', '375', '78', '183'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '133', '197', '88', '123'],
 ['2007_000032.jpg', '500', '281', 'person', '195', '213', '180', '229'],
 ['2007_000032.jpg', '500', '281', 'person', '26', '44', '189', '238'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '9', '499', '107', '263'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '421', '482', '200', '226'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '325', '411', '188', '223'],
 ['2007_000039.jpg', '500', '375', 'tvmonitor', '156', '344', '89', '279'],
 ['2007_000042.jpg', '500', '335', 'train', '263', '500', '32', '295'],
 ['2007_000042.jpg', '500', '335', 'train', '1', '235', '36', '299'],
 ['2007_000061.jpg', '500', '333', 'boat', '274', '437', '11', '279'],
 ['2007_000061.jpg', '500', '333', 'boat', '184', '281', '214', '252'],
 ['2007_000063.jpg', '500', '375', 'do

In [34]:
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [35]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238


In [36]:
df.shape

(47468, 8)

In [37]:
df['name'].value_counts()

name
person         24727
chair           3058
car             2492
dog             1598
bottle          1561
cat             1277
bird            1271
pottedplant     1202
sheep           1084
boat            1059
aeroplane       1002
tvmonitor        893
sofa             841
bicycle          837
horse            803
diningtable      802
motorbike        801
cow              771
train            704
bus              685
Name: count, dtype: int64

In [39]:
import numpy as np
# Type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce').round().astype('int32')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47468 entries, 0 to 47467
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  47468 non-null  object
 1   width     47468 non-null  int32 
 2   height    47468 non-null  int32 
 3   name      47468 non-null  object
 4   xmin      47468 non-null  int32 
 5   xmax      47468 non-null  int32 
 6   ymin      47468 non-null  int32 
 7   ymax      47468 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 1.8+ MB


In [40]:
# Center x, Center y
df['center_x'] = ((df['xmax'] + df['xmin']) / 2) / df['width']
df['center_y'] = ((df['ymax'] + df['ymin']) / 2) / df['height']

# w
df['w'] = (df['xmax'] - df['xmin']) / df['width']

# h
df['h'] = (df['ymax'] - df['ymin']) / df['height']

In [41]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


### **Split data into train and test**

In [43]:
images = df['filename'].unique()

In [44]:
len(images)

22263

In [46]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = img_df.sample(frac=0.8)['filename'] # shuffle and pick 80% of images

In [47]:
img_train

15449    2011_004715.jpg
9012     2010_001830.jpg
8383     2010_000581.jpg
21965    2012_004034.jpg
18585    2012_000654.jpg
              ...       
3563     2008_005279.jpg
12836    2011_001251.jpg
6898     2009_003033.jpg
10236    2010_004244.jpg
8809     2010_001426.jpg
Name: filename, Length: 17810, dtype: object

In [50]:
# Ensure img_train is a list
img_train_list = list(img_train)

# Filter img_df to get filenames not in img_train_list
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train_list), 'filename'])

In [51]:
len(img_train), len(img_test)

(17810, 4453)

In [53]:
# Ensure img_train and img_test are lists
img_train_list = list(img_train)
img_test_list = list(img_test)

# Filter the DataFrame
train_df = df[df['filename'].isin(img_train_list)]
test_df = df[df['filename'].isin(img_test_list)]

In [56]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623


In [57]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
13,2007_000063.jpg,500,375,dog,123,379,115,275,0.502,0.52,0.512,0.426667
14,2007_000063.jpg,500,375,chair,75,428,1,375,0.503,0.501333,0.706,0.997333
16,2007_000121.jpg,500,375,tvmonitor,251,475,28,267,0.726,0.393333,0.448,0.637333
17,2007_000121.jpg,500,375,tvmonitor,22,251,28,273,0.273,0.401333,0.458,0.653333


### **Assign id number to object names**

In [58]:
# Label encoding
def label_encoding(x):
    labels = {
        'person': 0,
        'chair': 1,
        'car': 2,
        'dog': 3,
        'bottle': 4,
        'cat': 5,
        'bird': 6,
        'pottedplant': 7,
        'sheep': 8,
        'boat': 9,
        'aeroplane': 10,
        'tvmonitor': 11,
        'sofa': 12,
        'bicycle': 13,
        'horse': 14,
        'diningtable': 15,
        'motorbike': 16,
        'cow': 17,
        'train': 18,
        'bus': 19
    }
    return labels[x]

In [59]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [60]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665,10
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555,10
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377,0
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623,10


### **Save Image and Labels in text**

In [181]:
import os
from shutil import move

In [182]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [183]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [184]:
def save_data(filename, folder_path, group_obj):
    #move image
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) #moving image to destination folder

    #save labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [185]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [186]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
17805    None
17806    None
17807    None
17808    None
17809    None
Length: 17810, dtype: object

In [187]:
filename_series = pd.Series(groupby_obj_test.groups.keys())
filename_series.apply(save_data, args=(test_folder, groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
4448    None
4449    None
4450    None
4451    None
4452    None
Length: 4453, dtype: object