## Conversion from CSV to XML

In [3]:

import cv2
import random
import pandas as pd
import numpy as np
import json
import csv


import os
from xml.etree.ElementTree import parse, Element, SubElement, ElementTree
import xml.etree.ElementTree as ET
from collections import defaultdict
import shutil

from pathlib import Path


## Preprocessing of OID CSV to Final DataFrame

In [4]:
# Provide relevant paths

# Input path to the csv file of the entire dataset that comes with the downloaded OID images
ANNOT_CSV_PATH = r'C:\Users\mpe_03\Desktop\BagDetection\google_OID\csv_folder\train_annotations_bbox.csv'

# Input image path
IMG_PATH = r'C:\Users\mpe_03\Desktop\BagDetection\google_OID\no_overlap_Images(30_nov_21)'

# Input path to the class descriptions csv file
CLASS_CSV_PATH = r'C:\Users\mpe_03\Desktop\BagDetection\google_OID\csv_folder\class_descriptions_boxable.csv'

def get_df(ANNOT_CSV_PATH, img_path=None, class_csv_path=None, labels=None, out_csv='processed_df.csv'):
    """Build the per-box annotation table for the selected classes and save it as CSV.

    Steps:
      1. Read the class-description CSV (no header row; column 0 = class
         codename, column 1 = readable class name) and look up the codename
         of every requested label.
      2. Read the annotation CSV and keep only rows whose ``LabelName`` is one
         of those codenames.
      3. Read every file in the image folder whose name ends with ``jpg`` to
         obtain its height and width.
      4. Join boxes and image sizes on ``ImageID``, keep only boxes whose image
         was found, multiply the box coordinates by the image size and cast
         them to integers (the CSV is assumed to store coordinates as fractions
         of the image size -- TODO confirm).
      5. Replace codenames by readable names, lower-case the column names,
         write the table to ``out_csv`` and return it.

    Parameters
    ----------
    ANNOT_CSV_PATH : str
        Path to the annotation CSV. Must contain the columns ImageID,
        LabelName, XMin, YMin, XMax, YMax, IsOccluded and IsTruncated.
    img_path : str, optional
        Folder holding the images. Defaults to the module-level ``IMG_PATH``.
    class_csv_path : str, optional
        Path to the class-description CSV. Defaults to the module-level
        ``CLASS_CSV_PATH``.
    labels : list of str, optional
        Readable class names to keep. Defaults to
        ``['Person', 'Handbag', 'Luggage and bags', 'Backpack']``.
    out_csv : str, optional
        Name of the processed CSV that is written (without index column).

    Returns
    -------
    pandas.DataFrame
        Columns: imageid, width, height, labelname, xmin, ymin, xmax, ymax,
        isoccluded, istruncated.

    Raises
    ------
    IndexError
        If a requested label does not occur in the class-description CSV.
    """
    if img_path is None:
        img_path = IMG_PATH
    if class_csv_path is None:
        class_csv_path = CLASS_CSV_PATH
    if labels is None:
        labels = ['Person','Handbag','Luggage and bags','Backpack']# input the class labels you want

    # Select relevant classes
    classes_df = pd.read_csv(class_csv_path, usecols=[0,1], header=None) #0 - class codename in encrypted string
    code_to_name = dict(classes_df.values)

    encrypted_strings = []
    for label in labels:
        codes = classes_df.loc[classes_df[1] == label, 0]
        if codes.empty:
            # Same exception type the positional lookup used to raise, but with a usable message.
            raise IndexError(f'label {label!r} not found in {class_csv_path}')
        encrypted_strings.append(codes.iloc[0]) # getting the list of encrypted codenames in a list

    annot_df = pd.read_csv(
        ANNOT_CSV_PATH,
        usecols=['ImageID', 'LabelName', 'XMin', 'YMin', 'XMax', 'YMax', 'IsOccluded', 'IsTruncated'],
        index_col=False,
    ) #check column names
    sub_annot = annot_df.loc[annot_df['LabelName'].isin(encrypted_strings)]

    # Collect the pixel size of every image present on disk
    image_names = []
    height = []
    width = []
    unreadable = []
    for jpg_file in os.listdir(img_path):
        if not jpg_file.endswith('jpg'):
            continue
        image = cv2.imread(os.path.join(img_path, jpg_file))
        if image is None:
            # cv2.imread signals a missing/corrupt file by returning None instead of raising
            unreadable.append(jpg_file)
            continue
        h, w = image.shape[:2]
        image_names.append(os.path.splitext(jpg_file)[0])
        height.append(h)
        width.append(w)

    if unreadable:
        print(f'Skipped {len(unreadable)} unreadable image(s), e.g. {unreadable[:5]}')

    dims = pd.DataFrame({'ImageID': image_names,
                         'height': height,
                         'width': width})

    # Merge all Dataframes and clean them.
    # Rows without an image size (image not on disk) or without box coordinates
    # (image on disk but no box of the selected classes) are dropped.
    merged = pd.merge(sub_annot, dims, on='ImageID', how='outer')
    merged = merged.dropna(subset=['height', 'width', 'XMin', 'YMax', 'XMax', 'YMin'])

    df = merged.assign(
        XMin=(merged['XMin'] * merged['width']).astype('int64'),
        YMax=(merged['YMax'] * merged['height']).astype('int64'),
        XMax=(merged['XMax'] * merged['width']).astype('int64'),
        YMin=(merged['YMin'] * merged['height']).astype('int64'),
        height=merged['height'].astype('int64'),
        width=merged['width'].astype('int64'),
        IsOccluded=merged['IsOccluded'].astype('int64'),
        IsTruncated=merged['IsTruncated'].astype('int64'),
        LabelName=merged['LabelName'].replace(code_to_name),
    )
    df = df.rename(columns=str.lower)
    df = df[['imageid', 'width', 'height', 'labelname', 'xmin', 'ymin', 'xmax', 'ymax', 'isoccluded', 'istruncated']]

    df.to_csv(out_csv, index=False) #input name of processed csv
    return df #check
   
# Build the processed annotation table (get_df also writes it to processed_df.csv).
# The call is left as a bare last expression so the returned DataFrame is shown as the cell output.
get_df(ANNOT_CSV_PATH)


Unnamed: 0,imageid,width,height,labelname,xmin,ymin,xmax,ymax,isoccluded,istruncated
7,0000071d71a0a6f6,1024,768,Person,0,177,217,766,1,1
8,0000071d71a0a6f6,1024,768,Person,142,192,535,766,1,1
9,0000071d71a0a6f6,1024,768,Person,145,321,240,421,1,0
10,0000071d71a0a6f6,1024,768,Person,610,203,828,766,1,1
11,0000071d71a0a6f6,1024,768,Person,654,186,1022,766,1,1
...,...,...,...,...,...,...,...,...,...,...
1039818,ffc361de5bf50a6c,1024,684,Person,636,277,856,637,1,0
1039819,ffc36b1dcdef2769,1024,768,Luggage and bags,241,526,435,751,1,0
1040281,ffe351691b382ce2,768,1024,Person,1,76,483,993,1,1
1040594,fffb40d02f510b35,738,1024,Person,308,111,621,933,0,0


## Create XML files

In [6]:

# Folder that receives the generated XML annotation files
oid_xml_folder = "created_oid_xml"

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
os.makedirs(oid_xml_folder, exist_ok=True)


def write_xml(folder, filename, bbox_list):
    """Create the XML annotation file for one image.

    Parameters
    ----------
    folder : str
        Output folder (relative to the current directory unless absolute).
        It is created if it does not exist yet.
    filename : str
        Image identifier; any extension is stripped before it is used for the
        <filename>/<path> elements and for the name of the XML file.
    bbox_list : list of sequences
        One 10-item entry per bounding box, in the order
        (imageid, width, height, labelname, xmin, ymin, xmax, ymax,
        isoccluded, istruncated). Items may be strings or numbers.

    Raises
    ------
    IndexError
        If ``bbox_list`` is empty (the image size is taken from its first entry).
    ValueError
        If an entry does not hold exactly 10 items.
    """
    folder = str(folder)
    # One stem for both the XML elements and the output file so they can never disagree
    stem = os.path.splitext(str(filename))[0]

    root = Element('annotation')
    SubElement(root, 'folder').text = folder
    SubElement(root, 'filename').text = stem
    SubElement(root, 'path').text = './images/' + stem
    source = SubElement(root, 'source')
    SubElement(source, 'database').text = 'Unknown'

    # Image size is taken from the first entry
    e_filename, e_width, e_height, e_LabelName, e_xmin, e_ymin, e_xmax, e_ymax, IsOccluded, IsTruncated = bbox_list[0]

    # ElementTree can only serialise string text, hence the str() around every value
    size = SubElement(root, 'size')
    SubElement(size, 'width').text = str(e_width)
    SubElement(size, 'height').text = str(e_height)
    SubElement(size, 'depth').text = '3'

    SubElement(root, 'segmented').text = '0'

    for entry in bbox_list: # one <object> per bounding box
        e_filename, e_width, e_height, e_LabelName, e_xmin, e_ymin, e_xmax, e_ymax, IsOccluded, IsTruncated = entry

        obj = SubElement(root, 'object')
        SubElement(obj, 'name').text = str(e_LabelName)
        SubElement(obj, 'pose').text = 'Unspecified'
        SubElement(obj, 'truncated').text = str(IsTruncated)
        SubElement(obj, 'difficult').text = '0'
        SubElement(obj, 'occluded').text = str(IsOccluded)

        bbox = SubElement(obj, 'bndbox')
        SubElement(bbox, 'xmin').text = str(e_xmin)
        SubElement(bbox, 'ymin').text = str(e_ymin)
        SubElement(bbox, 'xmax').text = str(e_xmax)
        SubElement(bbox, 'ymax').text = str(e_ymax)

    #create indented formatting
    tree = ET.ElementTree(root)
    ET.indent(tree, space='\t', level=0)

    #write the xml files
    out_dir = os.path.join('.', folder)
    os.makedirs(out_dir, exist_ok=True)
    xml_filename = os.path.join(out_dir, stem + '.xml')
    tree.write(xml_filename)
    
    
entries_by_filename = defaultdict(list) #entries_by_filename holds a dict with values that are list

# newline='' hands newline handling to the csv module, as its documentation requires
with open('processed_df.csv', 'r', encoding='utf-8', newline='') as f_input_csv: #reference processed csv
    csv_input = csv.reader(f_input_csv)
    header = next(csv_input, None) # None (instead of StopIteration) when the file is empty

    for line_no, row in enumerate(csv_input, start=2): #traverse through the rows
        if not row:
            continue # csv.reader yields an empty list for blank lines
        if len(row) != 10:
            # write_xml unpacks exactly 10 fields per entry; fail here with the offending line number
            raise ValueError(f'processed_df.csv line {line_no}: expected 10 columns, got {len(row)}')
        entries_by_filename[row[0]].append(row) #for whole csv to xml, grouped by image id (first column)

for filename, entries in entries_by_filename.items():
    print(filename, len(entries))
    write_xml(oid_xml_folder, filename, entries)

0000071d71a0a6f6 5
0005e587bc8f21f7 5
000b29496f75c8e5 1
004358a50958f953 1
005930d0dbdb43e3 1
011b12191c9ee4d6 2
011be1c1a8124e01 1
015ebc34f1fed437 1
0178ad54140c4a13 9
018fc619543be2ec 18
0236efb0a50901e7 1
024d2292afc0c336 4
027082e9bbb0bd7a 1
02e1768fa7ca093c 9
03fbf3b305f8b319 1
04424357410e7e4e 1
045c87545dd0a0aa 1
0597b56c8817d453 4
059a43327ede0b6c 4
05aeb01855a1a529 1
06090a9607cb2d8e 1
0621682b1b5cb92a 11
06ca72bf03a05e54 2
06f48dbb4348b9aa 6
07d70a45845ae57e 2
0803305085dcf3d6 19
08e10dcd1764c2c0 13
0910ebc4415f601d 8
096f9afc571ada8d 18
09851c384498de6b 1
09938cea6d2c5398 1
09e552f73b9cc9c2 21
0a234df550cb0913 3
0a433185aa4d368b 1
0afbc39dd4baa87d 1
0b6cc0230949de4c 5
0b7ba05e84984851 1
0b97da98599bdaf0 1
0bb2bcae95856548 17
0bcf18a63e8225e8 2
0c16c51560b2aed8 4
0c2110f6f74e9b5b 6
0c7be4acc57524a3 2
0d0b9fbeed21f176 1
0d50b5d9f22d2055 1
0d5c7040a26dd273 4
0d6f9cd514ed1e91 3
0d9f8bc3769d222b 2
0eac4d6225e8c59f 2
0ec1684292d2a0f7 1
0f05d91095b58e25 1
0fa4870b2868185f 9
0fd4f

780e536fdcb17f9f 1
7818df7c67f37ff9 1
78237b24b2a1c165 2
782962cb46a4fd3a 1
784810cf852a3e81 1
7866b07607b58805 5
78758183e8960974 5
78758c3932d369f7 1
78a69e0389b575d3 1
78b67703da1373b1 1
78d1823cf1ba4643 1
78eca60fc4639244 3
78f411a90c4e22a0 5
790a507d9a864054 1
79448a42b0f6c944 2
794fe1dcffb6edaf 1
79528fddc54ad3d0 1
79b27a7116aabeda 6
79e58587660fe93a 4
79fc0aea9b7764b4 8
7a4e71e30fa696c6 1
7a54486886893999 1
7a706c34ad2fae80 1
7a823d532e64fce6 1
7a9a664f8ba216e7 3
7a9ce90e3cceab19 2
7a9f72602ec465e7 3
7aab432686f79710 1
7abe761b1988aad6 2
7acb8a54775d2a3d 3
7af5011bfc3cf2d7 2
7b082e15336734b5 1
7b686e06c0322207 6
7bb1e5a0a971e9d7 39
7bb3caefff27618a 40
7c097956fd7dcbd8 3
7c36885ba298afe6 1
7c3d42573792f260 5
7c46a1b0e15588de 1
7c64bb0180ecd958 3
7c7bccba10875d51 1
7c95a45ecb8384fb 1
7c967eafafd1fbe8 4
7c96b3b9a01f278d 1
7cbe120a4eb89c8d 1
7cfc5ae03771483e 6
7d1690c67cb4b38d 1
7d226f4b94bcca8f 3
7d31f99c49cf04b3 4
7d42bdcaea5badf2 2
7d544a911371e7ba 1
7d76db2fb9000d4e 1
7d8cfce8f1

b3db04a1b2fbb0cc 1
b3e18ef7f793b7f2 14
b45ad73945e81667 1
b4aa405f4d13d022 2
b4b2276758fc223f 5
b4e0986bba83b2d4 1
b4ee7d7db1046dac 1
b518538da5e4ba6c 1
b543966490edd38c 1
b5645a054cc2163c 2
b56fcd4f3bb32722 1
b57798ffad9b90cb 2
b59f5dc8bef2055f 2
b5a47e3394c7afaa 2
b5af72e6531bbfd4 1
b5dd7365c130b053 3
b5e5a7b0c0938d9e 1
b60ef65c11244ae6 5
b611b527edb8617d 1
b626c4da439ecca9 1
b67bcb5d6bb56b0d 2
b696dfe8f68dac30 4
b6a154bb47fee2f0 1
b6a6053c67c10fd5 2
b6c13a78031986b3 1
b7008bd9bcc8db96 1
b70aa5f54d62a767 8
b74bb0b79d59b3a5 6
b756432175f0f008 5
b75d12f9666234f7 3
b76117a949574772 1
b7791d50c1328084 2
b7a4d05a46a70835 1
b7af5ad44533c086 3
b7b7b72fb95317f7 1
b7c71b1561c6a0a5 2
b7f49e5b351183f7 1
b811fb49c24c4335 1
b81774416833c9c1 1
b81e29b99a03f121 2
b860d2d8b57b9fd6 8
b874eb10981aa600 1
b8770112d595c1cb 2
b87abddef388c6a9 3
b8a67f8ca54c7e08 2
b8a6b58ba665a9f2 1
b8ca37d77fc88589 1
b8e89d9d1316d655 1
b9118276b5f381ca 1
b93d58845b5d2c65 17
b9563126e19e6605 16
b9e8f7c87b92aa76 2
b9f5865e0

e7acac4af2d607f3 1
e7e290ceee068e36 1
e7f85cf772fe5eb7 1
e8064ee1bac583a3 1
e8274f259aeb148b 31
e83570b0e67c1cc9 1
e91b36a0fd28d694 1
e91d410ff8ae7a9c 1
e949c0d8102165a5 3
e94e789ef0742857 1
e95922da0ddee2b1 5
e9642f70f45669d6 3
e9792923c3b40eeb 1
e990d6e9c2a07cf3 1
e9bd7ee8f31cb0d8 2
e9c7fa10b723921c 4
e9d16dada66a64d1 5
ea2a2ed7e6f6e2b1 2
ea3386cf86cb6f6e 10
ea45781753b22da0 1
ea5806f032b6ccdc 1
ea66455cdd34db61 1
ea83b4b3619e627a 2
ea91c89f207f53e8 1
eaa65165ea18c95a 1
eadf555ae634307c 7
eb1dd2a53861b47b 2
eb5d02f23716b2ea 1
eb5f6ce3c0653453 1
ec0fec93cf6a448b 4
ec24e54bccf501d4 7
ec398e21e01e05f9 1
ec3c343091da3bc6 1
eca1a545766e13ac 2
ecb0862f8b695e1f 2
ecce54c557df6ee9 8
ecdd691ce9584b69 2
ecec0d3e2949d370 8
ecff5e29e0cc7277 1
ed294d1f1ca87a21 2
ed4c863ebca79ee1 3
ed5380b275d7f30f 5
ed60f380eb6ec9dc 4
ed77b4b0fd07d22a 1
ed87dc0510b730e0 3
eda9b2e79c7fef35 3
edd9114c90d658cc 1
ee0179a3c8d57d8b 4
ee08f9f32bab2b79 1
ee30560aa1b6173f 2
ee4ca18eb067881a 12
ee742ef17854a4f8 2
ee888adf3

## THE END