In [15]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from functools import reduce
from xml.etree import ElementTree as et # parse information from XML

In [17]:
# Load all xml files and store in a list
xmlfiles = glob('./data_images/*.xml')

# Data cleaning.. replace \\ with /
xmlfiles = list(map(lambda x: x.replace("\\", "/"), xmlfiles))

In [18]:
xmlfiles

['./data_images/000001.xml',
 './data_images/000002.xml',
 './data_images/000007.xml',
 './data_images/000009.xml',
 './data_images/000012.xml',
 './data_images/000016.xml',
 './data_images/000017.xml',
 './data_images/000019.xml',
 './data_images/000020.xml',
 './data_images/000021.xml',
 './data_images/000023.xml',
 './data_images/000024.xml',
 './data_images/000026.xml',
 './data_images/000030.xml',
 './data_images/000032.xml',
 './data_images/000033.xml',
 './data_images/000034.xml',
 './data_images/000035.xml',
 './data_images/000036.xml',
 './data_images/000039.xml',
 './data_images/000041.xml',
 './data_images/000042.xml',
 './data_images/000044.xml',
 './data_images/000046.xml',
 './data_images/000047.xml',
 './data_images/000048.xml',
 './data_images/000050.xml',
 './data_images/000051.xml',
 './data_images/000052.xml',
 './data_images/000060.xml',
 './data_images/000061.xml',
 './data_images/000063.xml',
 './data_images/000064.xml',
 './data_images/000065.xml',
 './data_image

In [19]:
# Step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extrat_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find("filename").text

    # width and height of the image
    width = root.find("size").find("width").text
    height = root.find("size").find("height").text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find("bndbox")
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return parser    

In [27]:
parser_all = list(map(extrat_text, xmlfiles))

In [28]:
data = reduce(lambda x, y: x+y, parser_all)

In [29]:
df = pd.DataFrame(data, columns = ["filename", "width", "height", "name", "xmin", "xmax", "ymin", "ymax"])

In [30]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,000001.jpg,1024,657,car,14,301,335,522
1,000001.jpg,1024,657,car,269,571,345,489
2,000001.jpg,1024,657,car,502,798,342,450
3,000001.jpg,1024,657,car,709,1009,333,438
4,000002.jpg,800,600,car,41,768,240,497


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  object
 2   height    15663 non-null  object
 3   name      15663 non-null  object
 4   xmin      15663 non-null  object
 5   xmax      15663 non-null  object
 6   ymin      15663 non-null  object
 7   ymax      15663 non-null  object
dtypes: object(8)
memory usage: 979.1+ KB


In [32]:
cols = ["width", "height", "xmin", "xmax", "ymin", "ymax"]
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  int32 
 2   height    15663 non-null  int32 
 3   name      15663 non-null  object
 4   xmin      15663 non-null  int32 
 5   xmax      15663 non-null  int32 
 6   ymin      15663 non-null  int32 
 7   ymax      15663 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 612.0+ KB


In [33]:
# center x, center y
df['center_x'] = ((df["xmax"]+df["xmin"])/2)/df["width"]
df['center_y'] = ((df["ymax"]+df["ymin"])/2)/df["width"]

# w
df["w"] = (df["xmax"]-df["xmin"])/df["width"]

# h
df["h"] = (df["xmax"]-df["xmin"])/df["height"]

In [34]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,000001.jpg,1024,657,car,14,301,335,522,0.153809,0.418457,0.280273,0.436834
1,000001.jpg,1024,657,car,269,571,345,489,0.410156,0.407227,0.294922,0.459665
2,000001.jpg,1024,657,car,502,798,342,450,0.634766,0.386719,0.289062,0.450533
3,000001.jpg,1024,657,car,709,1009,333,438,0.838867,0.376465,0.292969,0.456621
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.460625,0.90875,1.211667


In [35]:
images = df["filename"].unique()

In [36]:
len(images)

5012

In [37]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=["filename"])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [38]:
img_test = tuple(img_df.query(f"filename not in {img_train}")["filename"]) # take rest 20% images

In [39]:
len(img_train), len(img_test)

(4010, 1002)

In [40]:
train_df = df.query(f"filename in {img_train}")
test_df = df.query(f"filename in {img_test}")

In [41]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.460625,0.90875,1.211667
5,000002.jpg,800,600,car,533,722,236,299,0.784375,0.334375,0.23625,0.315
6,000007.jpg,500,333,car,141,500,50,330,0.641,0.38,0.718,1.078078
7,000009.jpg,500,375,horse,69,270,172,330,0.339,0.502,0.402,0.536
8,000009.jpg,500,375,person,150,229,141,284,0.379,0.425,0.158,0.210667


In [42]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,000001.jpg,1024,657,car,14,301,335,522,0.153809,0.418457,0.280273,0.436834
1,000001.jpg,1024,657,car,269,571,345,489,0.410156,0.407227,0.294922,0.459665
2,000001.jpg,1024,657,car,502,798,342,450,0.634766,0.386719,0.289062,0.450533
3,000001.jpg,1024,657,car,709,1009,333,438,0.838867,0.376465,0.292969,0.456621
12,000016.jpg,334,500,bicycle,92,305,72,473,0.594311,0.815868,0.637725,0.426


In [43]:
# Label encoding
def label_encoding(x):
    labels = {"person":0, "car":1, "chair":2, "bottle":3, "pottedplant":4, "bird":5, "dog":6,
             "sofa":7, "bicycle":8, "horse":9, "boat":10, "motorbike":11, "cat":12, "tvmonitor":13,
             "cow":14, "sheep":15, "aeroplane":16, "train":17, "diningtable":18, "bus":19}
    return labels[x]

In [44]:
train_df["id"] = train_df["name"].apply(label_encoding)
test_df["id"] = test_df["name"].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["id"] = train_df["name"].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["id"] = test_df["name"].apply(label_encoding)


In [45]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.460625,0.90875,1.211667,1
5,000002.jpg,800,600,car,533,722,236,299,0.784375,0.334375,0.23625,0.315,1
6,000007.jpg,500,333,car,141,500,50,330,0.641,0.38,0.718,1.078078,1
7,000009.jpg,500,375,horse,69,270,172,330,0.339,0.502,0.402,0.536,9
8,000009.jpg,500,375,person,150,229,141,284,0.379,0.425,0.158,0.210667,0
9,000009.jpg,500,375,person,285,327,201,331,0.612,0.532,0.084,0.112,0
10,000009.jpg,500,375,person,258,297,198,329,0.555,0.527,0.078,0.104,0
11,000012.jpg,500,333,car,156,351,97,270,0.507,0.367,0.39,0.585586,1
13,000017.jpg,480,364,person,185,279,62,199,0.483333,0.271875,0.195833,0.258242,0
14,000017.jpg,480,364,horse,90,403,78,336,0.513542,0.43125,0.652083,0.85989,9


In [46]:
import os
from shutil import move

In [47]:
train_folder = "data_images/train"
test_folder = "data_images/test"

os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [WinError 183] Halen varolan bir dosya oluşturulamaz: 'data_images/train'

In [48]:
cols = ["filename", "id", "center_x", "center_y", "w", "h"]
groupby_obj_train = train_df[cols].groupby("filename")
groupby_obj_test = test_df[cols].groupby("filename")

In [62]:
# Save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join("data_images", filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move image to destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [63]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [64]:
filename_series

0       000002.jpg
1       000007.jpg
2       000009.jpg
3       000012.jpg
4       000017.jpg
           ...    
4005    009949.jpg
4006    009950.jpg
4007    009954.jpg
4008    009955.jpg
4009    009959.jpg
Length: 4010, dtype: object

In [65]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [66]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [67]:
filename_series_test

0       000001.jpg
1       000016.jpg
2       000030.jpg
3       000044.jpg
4       000047.jpg
           ...    
997     009938.jpg
998     009940.jpg
999     009946.jpg
1000    009958.jpg
1001    009961.jpg
Length: 1002, dtype: object

In [68]:
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object