In [7]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [12]:
# Load all xml files and store in a list
xml_list = glob('./data_images/*.xml')

# Data cleaning.. replace \\ with /
xml_list = list(map(lambda x: x.replace("\\", "/"), xmlfiles))

In [13]:
xmlfiles

['./data_images/aeroplane-16749_1280.xml',
 './data_images/aeroplane-1838708_1280.xml',
 './data_images/aeroplane-4347341_1280.xml',
 './data_images/aeroplane-93499_1280.xml',
 './data_images/air-84603_1280.xml',
 './data_images/air-force-2178863_1280.xml',
 './data_images/air-force-438465_1280.xml',
 './data_images/airbus-2178588_1280.xml',
 './data_images/airbus-2466266_1280.xml',
 './data_images/airbus-4454338_1280.xml',
 './data_images/aircraft-4885805_1280.xml',
 './data_images/aircraft-5572156_1280.xml',
 './data_images/aircraft-994948_1280.xml',
 './data_images/aircraft-take-off-123028_1280.xml',
 './data_images/airline-2908745_1280.xml',
 './data_images/airliner-2545346_1280.xml',
 './data_images/airplane-1155134_1280.xml',
 './data_images/airplane-175595_1280.xml',
 './data_images/airplane-2315427_1280.xml',
 './data_images/airplane-2726633_1280.xml',
 './data_images/airplane-2745898_1280.xml',
 './data_images/airplane-316716_1280.xml',
 './data_images/airplane-5532542_1280.xm

In [14]:
# Step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extrat_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find("filename").text

    # width and height of the image
    width = root.find("size").find("width").text
    height = root.find("size").find("height").text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find("bndbox")
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return parser  

In [15]:
parser_all = list(map(extrat_text, xml_list))

In [16]:
data = reduce(lambda x, y: x+y, parser_all)

In [17]:
df = pd.DataFrame(data, columns = ["filename", "width", "height", "name", "xmin", "xmax", "ymin", "ymax"])

In [18]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,aeroplane-16749_1280.jpg,1280,853,plane,385,980,348,665
1,aeroplane-1838708_1280.jpg,1280,720,plane,2,1220,199,556
2,aeroplane-4347341_1280.jpg,1280,853,plane,213,1105,117,341
3,aeroplane-93499_1280.jpg,1280,840,plane,32,1133,263,592
4,air-84603_1280.jpg,1280,853,plane,400,879,495,690


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1287 entries, 0 to 1286
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1287 non-null   object
 1   width     1287 non-null   object
 2   height    1287 non-null   object
 3   name      1287 non-null   object
 4   xmin      1287 non-null   object
 5   xmax      1287 non-null   object
 6   ymin      1287 non-null   object
 7   ymax      1287 non-null   object
dtypes: object(8)
memory usage: 80.6+ KB


In [20]:
cols = ["width", "height", "xmin", "xmax", "ymin", "ymax"]
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1287 entries, 0 to 1286
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1287 non-null   object
 1   width     1287 non-null   int32 
 2   height    1287 non-null   int32 
 3   name      1287 non-null   object
 4   xmin      1287 non-null   int32 
 5   xmax      1287 non-null   int32 
 6   ymin      1287 non-null   int32 
 7   ymax      1287 non-null   int32 
dtypes: int32(6), object(2)
memory usage: 50.4+ KB


In [21]:
# center_x, center_y, width, height
df["center_x"] = (df["xmin"] + df["xmax"])/(2*df["width"])
df["center_y"] = (df["ymin"] + df["ymax"])/(2*df["height"])

df["bb_width"] = (df["xmax"] - df["xmin"])/df["width"]
df["bb_height"] = (df["ymax"] - df["ymin"])/df["height"]

In [22]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,bb_width,bb_height
0,aeroplane-16749_1280.jpg,1280,853,plane,385,980,348,665,0.533203,0.593787,0.464844,0.37163
1,aeroplane-1838708_1280.jpg,1280,720,plane,2,1220,199,556,0.477344,0.524306,0.951562,0.495833
2,aeroplane-4347341_1280.jpg,1280,853,plane,213,1105,117,341,0.514844,0.268464,0.696875,0.262603
3,aeroplane-93499_1280.jpg,1280,840,plane,32,1133,263,592,0.455078,0.508929,0.860156,0.391667
4,air-84603_1280.jpg,1280,853,plane,400,879,495,690,0.499609,0.694607,0.374219,0.228605


In [24]:
images = df["filename"].unique()

In [25]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=["filename"])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [26]:
img_test = tuple(img_df.query(f"filename not in {img_train}")["filename"]) # take rest 20% images

In [27]:
len(img_train), len(img_test)

(632, 158)

In [28]:
train_df = df.query(f"filename in {img_train}")
test_df = df.query(f"filename in {img_test}")  

In [29]:
len(img_train), len(img_test)

(632, 158)

In [30]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,bb_width,bb_height
2,aeroplane-4347341_1280.jpg,1280,853,plane,213,1105,117,341,0.514844,0.268464,0.696875,0.262603
4,air-84603_1280.jpg,1280,853,plane,400,879,495,690,0.499609,0.694607,0.374219,0.228605
5,air-force-2178863_1280.jpg,1280,720,plane,36,1280,68,626,0.514062,0.481944,0.971875,0.775
6,air-force-438465_1280.jpg,1280,853,plane,163,1112,235,596,0.498047,0.487104,0.741406,0.423212
7,pexels-max-walter-4841667.jpg,3712,5568,plane,1249,3063,2327,2912,0.580819,0.470456,0.488685,0.105065


In [31]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,bb_width,bb_height
0,aeroplane-16749_1280.jpg,1280,853,plane,385,980,348,665,0.533203,0.593787,0.464844,0.37163
1,aeroplane-1838708_1280.jpg,1280,720,plane,2,1220,199,556,0.477344,0.524306,0.951562,0.495833
3,aeroplane-93499_1280.jpg,1280,840,plane,32,1133,263,592,0.455078,0.508929,0.860156,0.391667
12,aircraft-994948_1280.jpg,1280,670,plane,93,1104,184,534,0.467578,0.535821,0.789844,0.522388
15,airliner-2545346_1280.png,1280,497,plane,15,1280,3,471,0.505859,0.476861,0.988281,0.94165


In [32]:
# Label encoding
def label_encoding(x):
    labels = {"plane":0}
    return labels[x]

In [33]:
train_df["id"] = train_df["name"].apply(label_encoding)
test_df["id"] = test_df["name"].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["id"] = train_df["name"].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["id"] = test_df["name"].apply(label_encoding)


In [34]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,bb_width,bb_height,id
2,aeroplane-4347341_1280.jpg,1280,853,plane,213,1105,117,341,0.514844,0.268464,0.696875,0.262603,0
4,air-84603_1280.jpg,1280,853,plane,400,879,495,690,0.499609,0.694607,0.374219,0.228605,0
5,air-force-2178863_1280.jpg,1280,720,plane,36,1280,68,626,0.514062,0.481944,0.971875,0.775,0
6,air-force-438465_1280.jpg,1280,853,plane,163,1112,235,596,0.498047,0.487104,0.741406,0.423212,0
7,pexels-max-walter-4841667.jpg,3712,5568,plane,1249,3063,2327,2912,0.580819,0.470456,0.488685,0.105065,0
8,airbus-2466266_1280.jpg,1280,860,plane,1,1028,311,599,0.401953,0.52907,0.802344,0.334884,0
9,airbus-4454338_1280.jpg,1280,853,plane,1,1280,60,729,0.500391,0.462485,0.999219,0.784291,0
10,aircraft-4885805_1280.jpg,1280,853,plane,265,747,394,525,0.395313,0.538687,0.376563,0.153576,0
11,aircraft-5572156_1280.png,1280,720,plane,58,1250,200,521,0.510938,0.500694,0.93125,0.445833,0
13,aircraft-take-off-123028_1280.jpg,1280,853,plane,281,1273,209,736,0.607031,0.553927,0.775,0.617819,0


In [35]:
import os
from shutil import move

In [36]:
train_folder = "data_images/train"
test_folder = "data_images/test"

os.mkdir(train_folder)
os.mkdir(test_folder)

In [37]:
cols = ["filename", "id", "center_x", "center_y", "bb_width", "bb_height"]
groupby_obj_train = train_df[cols].groupby("filename")
groupby_obj_test = test_df[cols].groupby("filename")

In [38]:
# Save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join("data_images", filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move image to destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [39]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [40]:
filename_series

0               aeroplane-4347341_1280.jpg
1                       air-84603_1280.jpg
2               air-force-2178863_1280.jpg
3                air-force-438465_1280.jpg
4                  airbus-2466266_1280.jpg
                      ...                 
627                travel-1038110_1280.jpg
628                   u-2-1747353_1280.jpg
629             vacations-1560815_1280.jpg
630                virgin-2721333_1280.jpg
631    yeyo-salas-SoocKX8NAWQ-unsplash.jpg
Length: 632, dtype: object

In [41]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
627    None
628    None
629    None
630    None
631    None
Length: 632, dtype: object

In [42]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [43]:
filename_series_test

0                aeroplane-16749_1280.jpg
1              aeroplane-1838708_1280.jpg
2                aeroplane-93499_1280.jpg
3                aircraft-994948_1280.jpg
4               airliner-2545346_1280.png
                      ...                
153            sukhoi-35-1880484_1280.jpg
154            to-travel-4022443_1280.jpg
155               travel-1038105_1280.jpg
156      tricolor-arrows-2010752_1280.jpg
157    vista-wei-LXWnLkS-UyY-unsplash.jpg
Length: 158, dtype: object

In [44]:
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
153    None
154    None
155    None
156    None
157    None
Length: 158, dtype: object