In [1]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import time

#### • Convert xmls to pandas dataframe

In [2]:
def xml_to_df(path, subdirectories = True):
    xml_list = []
    if subdirectories:
        classes = os.listdir(path)
        for cls in classes:
            for xml_file in glob.glob(os.path.join(path, cls) + '/*.xml'):
                tree = ET.parse(xml_file)
                root = tree.getroot()
                for member in root.findall('object'):
                    value = (root.find('filename').text,
                            int(root.find('size')[0].text),
                            int(root.find('size')[1].text),
                            member[0].text,
                            int(member[4][0].text),
                            int(member[4][1].text),
                            int(member[4][2].text),
                            int(member[4][3].text)
                            )
                    xml_list.append(value)
        column_name = ['filename', 'width', 'height', 'classes', 'xmin', 'ymin', 'xmax', 'ymax']
        xml_df = pd.DataFrame(xml_list, columns=column_name)
    else:
        for xml_file in glob.glob(path + '/*.xml'):
            tree = ET.parse(xml_file)
            root = tree.getroot()
            for member in root.findall('object'):
                value = (root.find('filename').text,
                        int(root.find('size')[0].text),
                        int(root.find('size')[1].text),
                        member[0].text,
                        int(member[4][0].text),
                        int(member[4][1].text),
                        int(member[4][2].text),
                        int(member[4][3].text)
                        )
                xml_list.append(value)
        column_name = ['filename', 'width', 'height', 'classes', 'xmin', 'ymin', 'xmax', 'ymax']
        xml_df = pd.DataFrame(xml_list, columns=column_name)

    return xml_df

##### • Input path to directory containing subdirectory of images

In [3]:
src_path= os.path.join(os.getcwd(),'food_data')
df_xml = xml_to_df(src_path, subdirectories = False)

In [4]:
df_xml.head(7)

Unnamed: 0,filename,width,height,classes,xmin,ymin,xmax,ymax
0,100576.jpg,384,512,donuts,27,192,279,464
1,100576.jpg,384,512,donuts,129,7,339,213
2,100786.jpg,382,512,donuts,30,127,356,418
3,101905.jpg,512,384,donuts,112,21,249,151
4,101905.jpg,512,384,donuts,247,28,384,153
5,101905.jpg,512,384,donuts,212,153,370,293
6,101905.jpg,512,384,donuts,371,163,512,345


#####  • Address of gcloud bucket storing images

In [5]:
gbucket_path = 'gs://your_gcloud_bucket_name/food_data/'

In [6]:
df_xml['filename']= df_xml.filename.apply(lambda x: os.path.join(gbucket_path, x))#gbucket_path+ x)
df_xml.head(6)

Unnamed: 0,filename,width,height,classes,xmin,ymin,xmax,ymax
0,gs://your_gcloud_bucket_name/food_data/100576.jpg,384,512,donuts,27,192,279,464
1,gs://your_gcloud_bucket_name/food_data/100576.jpg,384,512,donuts,129,7,339,213
2,gs://your_gcloud_bucket_name/food_data/100786.jpg,382,512,donuts,30,127,356,418
3,gs://your_gcloud_bucket_name/food_data/101905.jpg,512,384,donuts,112,21,249,151
4,gs://your_gcloud_bucket_name/food_data/101905.jpg,512,384,donuts,247,28,384,153
5,gs://your_gcloud_bucket_name/food_data/101905.jpg,512,384,donuts,212,153,370,293


In [7]:
df_xml.filename[0]

'gs://your_gcloud_bucket_name/food_data/100576.jpg'

In [8]:
cols_req = ['filename','xmin', 'ymin', 'xmax', 'ymax','classes']
df_txt = pd.DataFrame()    
df_txt = df_xml[cols_req].apply(lambda x : ','.join(x.astype(str)),axis=1)
df_txt.head(5)

0    gs://your_gcloud_bucket_name/food_data/100576....
1    gs://your_gcloud_bucket_name/food_data/100576....
2    gs://your_gcloud_bucket_name/food_data/100786....
3    gs://your_gcloud_bucket_name/food_data/101905....
4    gs://your_gcloud_bucket_name/food_data/101905....
dtype: object

In [9]:
df_txt[0]

'gs://your_gcloud_bucket_name/food_data/100576.jpg,27,192,279,464,donuts'

##### • Download df_txt as annotation.txt maybe?

In [28]:
dest_path=os.path.join(os.getcwd(),'annotations.txt')
df_txt.to_csv(dest_path, header= None, index=None,sep=' ')

###  ◘ Or Alternatively
####  [ A ] Modify older annotations.
#### [ B ] Parse CSV




#####  [A] Modifying annotations generated previously for local use.

In [10]:
train = pd.read_csv("annotations.txt", names=['filename', 'xmin','ymin','xmax','ymax','classes'])
train.head(5)

Unnamed: 0,filename,xmin,ymin,xmax,ymax,classes
0,food_data\cake\106496.jpg,164,29,487,323,cake
1,food_data\cake\106501.jpg,42,259,251,465,cake
2,food_data\cake\106501.jpg,139,120,307,298,cake
3,food_data\cake\110954.jpg,15,80,367,460,cake
4,food_data\cake\110954.jpg,1,82,111,288,cake


In [11]:
gbucket_path = 'gs://your_gcloud_bucket_name/food_data/'

train['filename']= train.filename.apply(lambda x: os.path.join(gbucket_path, x).replace('\\', '/'))
train.head(7)

Unnamed: 0,filename,xmin,ymin,xmax,ymax,classes
0,gs://your_gcloud_bucket_name/food_data/food_da...,164,29,487,323,cake
1,gs://your_gcloud_bucket_name/food_data/food_da...,42,259,251,465,cake
2,gs://your_gcloud_bucket_name/food_data/food_da...,139,120,307,298,cake
3,gs://your_gcloud_bucket_name/food_data/food_da...,15,80,367,460,cake
4,gs://your_gcloud_bucket_name/food_data/food_da...,1,82,111,288,cake
5,gs://your_gcloud_bucket_name/food_data/food_da...,30,26,427,384,cake
6,gs://your_gcloud_bucket_name/food_data/food_da...,53,172,320,466,cake


In [12]:
train.filename[0]

'gs://your_gcloud_bucket_name/food_data/food_data/cake/106496.jpg'

In [13]:
data = pd.DataFrame()
data['format']= train[train.columns].apply(lambda x: ','.join(x.astype(str)), axis=1)
print(data.format[0],'\n'+ data.format[770])

#data.to_csv('annotations.txt', header=None, index=None, sep=' ')

gs://your_gcloud_bucket_name/food_data/food_data/cake/106496.jpg,164,29,487,323,cake 
gs://your_gcloud_bucket_name/food_data/food_data/dosa/aug_91_Copy.jpg,169,228,1369,1985,dosa


#### [B] Parse CSV file generated by converting xmls

In [17]:
train = pd.read_csv("food_dataset.csv")
train.head(5)

Unnamed: 0,filename,width,height,classes,xmin,ymin,xmax,ymax
0,106496.jpg,512,343,cake,164,29,487,323
1,106501.jpg,382,512,cake,42,259,251,465
2,106501.jpg,382,512,cake,139,120,307,298
3,110954.jpg,384,512,cake,15,80,367,460
4,110954.jpg,384,512,cake,1,82,111,288


In [18]:
train['classes'].value_counts()

cake      439
donuts    218
dosa      119
Name: classes, dtype: int64

#### • Prefixing a common bucket folder path for cloud training

In [19]:
gbucket_path = 'gs://your_gcloud_bucket_name/food_data/'

train['filename']= train['filename'].apply(lambda x: gbucket_path +x)#prefixing a common bucket folder path for cloud training
train.head(5)

Unnamed: 0,filename,width,height,classes,xmin,ymin,xmax,ymax
0,gs://your_gcloud_bucket_name/food_data/106496.jpg,512,343,cake,164,29,487,323
1,gs://your_gcloud_bucket_name/food_data/106501.jpg,382,512,cake,42,259,251,465
2,gs://your_gcloud_bucket_name/food_data/106501.jpg,382,512,cake,139,120,307,298
3,gs://your_gcloud_bucket_name/food_data/110954.jpg,384,512,cake,15,80,367,460
4,gs://your_gcloud_bucket_name/food_data/110954.jpg,384,512,cake,1,82,111,288


In [20]:
print(list(train.columns))

['filename', 'width', 'height', 'classes', 'xmin', 'ymin', 'xmax', 'ymax']


#### • Re-arraging columns & dropping irrelavant ones to obtain column order as required

In [21]:
train = train[['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'classes']]#re-arraging the column order as required & dropping irrelavant columns of width and height
train.head(4)

Unnamed: 0,filename,xmin,ymin,xmax,ymax,classes
0,gs://your_gcloud_bucket_name/food_data/106496.jpg,164,29,487,323,cake
1,gs://your_gcloud_bucket_name/food_data/106501.jpg,42,259,251,465,cake
2,gs://your_gcloud_bucket_name/food_data/106501.jpg,139,120,307,298,cake
3,gs://your_gcloud_bucket_name/food_data/110954.jpg,15,80,367,460,cake


#### Concatenating columns of _train_ to a new dataframe _data_; And saving as annotations text file. .

In [22]:
data = pd.DataFrame()
data['format']= train[train.columns].apply(lambda x: ','.join(x.astype(str)), axis=1)
print(data.format[0], '\n', data.format[770])

#data.to_csv('annotations.txt', header=None, index=None, sep=' ')

gs://your_gcloud_bucket_name/food_data/106496.jpg,164,29,487,323,cake 
 gs://your_gcloud_bucket_name/food_data/aug_91.jpg,8,2,1039,1199,dosa


## ◘ Also
#### Save above dataframe as a pickled dictionary.

In [26]:
import pickle

with open('data.pickle','wb') as file:
    #pickle.dump(data['format'], file)#pickling above dataframe.format as dataframe
    pickle.dump(dict(data['format']), file)#pickling above dataframe.format as dictionary
    
file.close()

##### • test reading the pickled file

In [27]:
with open('data.pickle','rb') as file:
    data_dict= pickle.load(file)

file.close()
print(data_dict)

{0: 'gs://your_gcloud_bucket_name/food_data/106496.jpg,164,29,487,323,cake', 1: 'gs://your_gcloud_bucket_name/food_data/106501.jpg,42,259,251,465,cake', 2: 'gs://your_gcloud_bucket_name/food_data/106501.jpg,139,120,307,298,cake', 3: 'gs://your_gcloud_bucket_name/food_data/110954.jpg,15,80,367,460,cake', 4: 'gs://your_gcloud_bucket_name/food_data/110954.jpg,1,82,111,288,cake', 5: 'gs://your_gcloud_bucket_name/food_data/112438.jpg,30,26,427,384,cake', 6: 'gs://your_gcloud_bucket_name/food_data/114735.jpg,53,172,320,466,cake', 7: 'gs://your_gcloud_bucket_name/food_data/114847.jpg,45,132,261,331,cake', 8: 'gs://your_gcloud_bucket_name/food_data/114847.jpg,255,69,464,274,cake', 9: 'gs://your_gcloud_bucket_name/food_data/122408.jpg,112,139,240,269,cake', 10: 'gs://your_gcloud_bucket_name/food_data/122408.jpg,199,193,335,326,cake', 11: 'gs://your_gcloud_bucket_name/food_data/122408.jpg,264,118,403,260,cake', 12: 'gs://your_gcloud_bucket_name/food_data/125669.jpg,10,115,117,212,cake', 13: 'gs: