In [1]:
import os
from glob import glob 
import pandas as pd 
from xml.etree import ElementTree as et 
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
xmlfiles = glob('./data_images/*.xml')
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [None]:
xmlfiles

In [14]:
tree = et.parse('./data_images/yorktown1._rotated2.xml')
root = tree.getroot()
image_name = root.find('filename').text
print(image_name)

yorktown1._rotated2.jpg


In [15]:

def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    image_name = root.find('filename').text
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [16]:
parser_all = list(map(extract_text,xmlfiles))

In [None]:
parser_all

In [17]:
data = reduce(lambda x, y : x+y,parser_all)

In [19]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [20]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,anderson1.jpeg,971,882,mound,422,490,424,460
1,anderson1.jpeg,971,882,mound,417,458,465,522
2,anderson1.jpeg,971,882,mound,462,488,463,526
3,anderson1._rotated.jpg,882,971,mound,433,457,493,533
4,anderson1._rotated.jpg,882,971,mound,464,518,514,548


In [21]:
df.shape

(308, 8)

In [22]:
df['name'].value_counts()

mound    308
Name: name, dtype: int64

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  308 non-null    object
 1   width     308 non-null    object
 2   height    308 non-null    object
 3   name      308 non-null    object
 4   xmin      308 non-null    object
 5   xmax      308 non-null    object
 6   ymin      308 non-null    object
 7   ymax      308 non-null    object
dtypes: object(8)
memory usage: 19.4+ KB


In [24]:
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  308 non-null    object
 1   width     308 non-null    int32 
 2   height    308 non-null    int32 
 3   name      308 non-null    object
 4   xmin      308 non-null    int32 
 5   xmax      308 non-null    int32 
 6   ymin      308 non-null    int32 
 7   ymax      308 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 12.2+ KB


In [25]:
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [26]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,anderson1.jpeg,971,882,mound,422,490,424,460,0.469619,0.501134,0.070031,0.040816
1,anderson1.jpeg,971,882,mound,417,458,465,522,0.450566,0.559524,0.042225,0.064626
2,anderson1.jpeg,971,882,mound,462,488,463,526,0.489186,0.560658,0.026777,0.071429
3,anderson1._rotated.jpg,882,971,mound,433,457,493,533,0.504535,0.528321,0.027211,0.041195
4,anderson1._rotated.jpg,882,971,mound,464,518,514,548,0.556689,0.546859,0.061224,0.035015


In [27]:
images = df['filename'].unique()

In [28]:
len(images)

136

In [29]:

img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) 

In [30]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [31]:
len(img_train), len(img_test)

(109, 27)

In [32]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [33]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,anderson1.jpeg,971,882,mound,422,490,424,460,0.469619,0.501134,0.070031,0.040816
1,anderson1.jpeg,971,882,mound,417,458,465,522,0.450566,0.559524,0.042225,0.064626
2,anderson1.jpeg,971,882,mound,462,488,463,526,0.489186,0.560658,0.026777,0.071429
6,anderson1._rotated1.jpg,971,882,mound,485,512,365,422,0.513388,0.446145,0.027806,0.064626
7,anderson1._rotated1.jpg,971,882,mound,518,547,358,422,0.548404,0.442177,0.029866,0.072562


In [34]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
3,anderson1._rotated.jpg,882,971,mound,433,457,493,533,0.504535,0.528321,0.027211,0.041195
4,anderson1._rotated.jpg,882,971,mound,464,518,514,548,0.556689,0.546859,0.061224,0.035015
5,anderson1._rotated.jpg,882,971,mound,462,521,486,512,0.557256,0.513903,0.066893,0.026777
9,anderson1._rotated2.jpg,882,971,mound,424,450,437,478,0.495465,0.471164,0.029478,0.042225
10,anderson1._rotated2.jpg,882,971,mound,364,419,459,487,0.443878,0.487127,0.062358,0.028836


In [35]:
def label_encoding(x):
    labels = {'mound':0}
    return labels[x]

In [36]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [37]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,anderson1.jpeg,971,882,mound,422,490,424,460,0.469619,0.501134,0.070031,0.040816,0
1,anderson1.jpeg,971,882,mound,417,458,465,522,0.450566,0.559524,0.042225,0.064626,0
2,anderson1.jpeg,971,882,mound,462,488,463,526,0.489186,0.560658,0.026777,0.071429,0
6,anderson1._rotated1.jpg,971,882,mound,485,512,365,422,0.513388,0.446145,0.027806,0.064626,0
7,anderson1._rotated1.jpg,971,882,mound,518,547,358,422,0.548404,0.442177,0.029866,0.072562,0
8,anderson1._rotated1.jpg,971,882,mound,491,537,425,448,0.529351,0.494898,0.047374,0.026077,0
16,anderson2._rotated.jpg,882,970,mound,360,495,427,553,0.484694,0.505155,0.153061,0.129897,0
17,anderson2._rotated.jpg,882,970,mound,486,537,618,662,0.579932,0.659794,0.057823,0.045361,0
18,anderson2._rotated.jpg,882,970,mound,347,424,561,642,0.437075,0.620103,0.087302,0.083505,0
19,anderson2._rotated.jpg,882,970,mound,485,523,418,460,0.571429,0.452577,0.043084,0.043299,0


In [38]:
import os
from shutil import move

In [39]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [42]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [43]:
def save_data(filename, folder_path, group_obj):
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst)
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [44]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [45]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
104    None
105    None
106    None
107    None
108    None
Length: 109, dtype: object

In [46]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
dtype: object