# Imports and Dataframe Setup

In [1]:
import numpy as np
import pandas as pd
import textwrap
from xml.sax.saxutils import escape

In [2]:
# Load the bounding-box table (one row per patientId, up to four boxes per row)
# and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer='pneumonia_bbs.csv')
print(df.head(n=10))

                              patientId  x_min1  y_min1  x_max1  y_max1  \
0  035789b1-3736-405d-9910-f8f23c62ae9f      60      88      90     111   
1  03a9498c-549d-4e7d-800b-e74797f7f625      49     128     103     195   
2  03ae75b6-45a7-4a68-9871-f07b82b17bf6      52      99      98     222   
3  03cd7a5b-d5d7-40a1-81b1-c4264920530a      71      42     125     185   
4  03d92597-3e33-4fdf-8db5-a27cf5b8d3eb     148      60     194     144   
5  03e9a70f-3de8-4e13-b3f2-9dd6d75f496d      58      98      88     158   
6  03edb5ed-9e76-4abe-bc35-7bc95fea7e6a      46     102      93     179   
7  040a0743-f663-4746-8224-f0e3bacc7ba5      45     126     104     177   
8  0499513a-5d48-4cf9-aac8-115e2a52fe1a      95     153     122     194   
9  049d7317-5a8b-4fc4-b81f-159fe6b45a92     161      94     211     118   

   x_min2  y_min2  x_max2  y_max2  x_min3  y_min3  x_max3  y_max3  x_min4  \
0   173.0    84.0   208.0   132.0     NaN     NaN     NaN     NaN     NaN   
1   152.0   150.0   

In [3]:
# Single-box subset: keep the rows where the second box is missing, then
# discard every column that belongs to boxes 2-4.
extra_box_cols = [
    'x_min2', 'y_min2', 'x_max2', 'y_max2',
    'x_min3', 'y_min3', 'x_max3', 'y_max3',
    'x_min4', 'y_min4', 'x_max4', 'y_max4',
]
second_box_missing = df['x_min2'].isnull()
df_1box = df[second_box_missing].drop(extra_box_cols, axis='columns')
df_1box.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 4 to 5625
Data columns (total 5 columns):
patientId    2475 non-null object
x_min1       2475 non-null int64
y_min1       2475 non-null int64
x_max1       2475 non-null int64
y_max1       2475 non-null int64
dtypes: int64(4), object(1)
memory usage: 116.0+ KB


In [16]:
# create df for entries with exactly 2 bounding boxes
# (box 2 is present, box 3 is missing)
df_2box = df[df.x_min3.isnull() & df.x_min2.notnull()]
df_2box = df_2box.drop(['x_min3', 'y_min3', 'x_max3', 'y_max3', 'x_min4',
       'y_min4', 'x_max4', 'y_max4'], axis=1)
# The box-2 columns are read as float because they hold NaN for other rows.
# Convert with astype(), which returns a new frame with the requested dtypes.
# Assigning the converted values back through `.loc[:, ...] = ...` writes into
# the existing float columns on recent pandas and leaves them float, which
# would later put values such as "173.0" into the XML.
coord_cols = df_2box.loc[:, 'x_min1':].columns
df_2box = df_2box.astype({col: int for col in coord_cols})
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that only appended a stray "None" line to the output).
df_2box.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3036 entries, 0 to 5626
Data columns (total 9 columns):
patientId    3036 non-null object
x_min1       3036 non-null int32
y_min1       3036 non-null int32
x_max1       3036 non-null int32
y_max1       3036 non-null int32
x_min2       3036 non-null int32
y_min2       3036 non-null int32
x_max2       3036 non-null int32
y_max2       3036 non-null int32
dtypes: int32(8), object(1)
memory usage: 142.3+ KB
None


In [17]:
# create df for entries with exactly 3 bounding boxes
# (box 3 is present, box 4 is missing)
# NOTE(review): rows that do have a fourth box are not selected by the 1-, 2-
# or 3-box subsets visible here -- confirm that leaving them out is intended.
df_3box = df[df.x_min4.isnull() & df.x_min3.notnull()]
df_3box = df_3box.drop(['x_min4',
       'y_min4', 'x_max4', 'y_max4'], axis=1)
# The box-2/box-3 columns are read as float because they hold NaN for other
# rows. astype() returns a new frame with integer columns; assigning through
# `.loc[:, ...] = ...` keeps the existing float dtype on recent pandas.
coord_cols = df_3box.loc[:, 'x_min1':].columns
df_3box = df_3box.astype({col: int for col in coord_cols})
print(df_3box.head())

                                patientId  x_min1  y_min1  x_max1  y_max1  \
1    03a9498c-549d-4e7d-800b-e74797f7f625      49     128     103     195   
7    040a0743-f663-4746-8224-f0e3bacc7ba5      45     126     104     177   
112  085a6486-c3b3-4ca2-80f1-be07d23e853b      37      81      76     137   
159  098e14d4-3205-4c2d-a059-738f830c0aa5      31     148      67     195   
202  0afeabf5-d5a3-454f-a50d-887f0d21106c      49      81      78     126   

     x_min2  y_min2  x_max2  y_max2  x_min3  y_min3  x_max3  y_max3  
1       152     150     219     211     153      53     180      76  
7        73      57      95      93     159      60     220     197  
112      83     105     122     168     166      58     212     129  
159      42      76      96     161     122      80     203     194  
202      66     111     107     173     145     100     221     183  


In [18]:
# Print the column dtypes and non-null counts of the 3-box frame.
df_3box.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 1 to 5511
Data columns (total 13 columns):
patientId    105 non-null object
x_min1       105 non-null int32
y_min1       105 non-null int32
x_max1       105 non-null int32
y_max1       105 non-null int32
x_min2       105 non-null int32
y_min2       105 non-null int32
x_max2       105 non-null int32
y_max2       105 non-null int32
x_min3       105 non-null int32
y_min3       105 non-null int32
x_max3       105 non-null int32
y_max3       105 non-null int32
dtypes: int32(12), object(1)
memory usage: 6.6+ KB


# Set up Templates

In [19]:
# PASCAL VOC style annotation skeleton containing a single "opacity" object.
# The {filename} and {xmin1}/{ymin1}/{xmax1}/{ymax1} placeholders are meant to
# be filled with str.format(). The literal is indented for readability;
# textwrap.dedent() strips that common 4-space margin again.
template_1box = textwrap.dedent(
    """\
    <annotation>
        <folder>VOC2007</folder>
        <filename>{filename}.jpg</filename>
        <source>
            <database>The VOC2007 Database</database>
            <annotation>PASCAL VOC2007</annotation>
            <image>flickr</image>
            <flickrid>336426776</flickrid>
        </source>
        <owner>
            <flickrid>Elder Timothy Chaves</flickrid>
            <name>Tim Chaves</name>
        </owner>
        <size>
            <width>256</width>
            <height>256</height>
            <depth>3</depth>
        </size>
        <segmented>0</segmented>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin1}</xmin>
                <ymin>{ymin1}</ymin>
                <xmax>{xmax1}</xmax>
                <ymax>{ymax1}</ymax>
            </bndbox>
        </object>
    </annotation>"""
)

In [20]:
# PASCAL VOC style annotation skeleton containing two "opacity" objects.
# Placeholders: {filename} plus {xminN}/{yminN}/{xmaxN}/{ymaxN} for N = 1, 2,
# meant to be filled with str.format(). The literal is indented for
# readability; textwrap.dedent() strips that common 4-space margin again.
template_2box = textwrap.dedent(
    """\
    <annotation>
        <folder>VOC2007</folder>
        <filename>{filename}.jpg</filename>
        <source>
            <database>The VOC2007 Database</database>
            <annotation>PASCAL VOC2007</annotation>
            <image>flickr</image>
            <flickrid>336426776</flickrid>
        </source>
        <owner>
            <flickrid>Elder Timothy Chaves</flickrid>
            <name>Tim Chaves</name>
        </owner>
        <size>
            <width>256</width>
            <height>256</height>
            <depth>3</depth>
        </size>
        <segmented>0</segmented>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin1}</xmin>
                <ymin>{ymin1}</ymin>
                <xmax>{xmax1}</xmax>
                <ymax>{ymax1}</ymax>
            </bndbox>
        </object>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin2}</xmin>
                <ymin>{ymin2}</ymin>
                <xmax>{xmax2}</xmax>
                <ymax>{ymax2}</ymax>
            </bndbox>
        </object>
    </annotation>"""
)

In [21]:
# PASCAL VOC style annotation skeleton containing three "opacity" objects.
# Placeholders: {filename} plus {xminN}/{yminN}/{xmaxN}/{ymaxN} for N = 1..3,
# meant to be filled with str.format(). The literal is indented for
# readability; textwrap.dedent() strips that common 4-space margin again.
template_3box = textwrap.dedent(
    """\
    <annotation>
        <folder>VOC2007</folder>
        <filename>{filename}.jpg</filename>
        <source>
            <database>The VOC2007 Database</database>
            <annotation>PASCAL VOC2007</annotation>
            <image>flickr</image>
            <flickrid>336426776</flickrid>
        </source>
        <owner>
            <flickrid>Elder Timothy Chaves</flickrid>
            <name>Tim Chaves</name>
        </owner>
        <size>
            <width>256</width>
            <height>256</height>
            <depth>3</depth>
        </size>
        <segmented>0</segmented>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin1}</xmin>
                <ymin>{ymin1}</ymin>
                <xmax>{xmax1}</xmax>
                <ymax>{ymax1}</ymax>
            </bndbox>
        </object>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin2}</xmin>
                <ymin>{ymin2}</ymin>
                <xmax>{xmax2}</xmax>
                <ymax>{ymax2}</ymax>
            </bndbox>
        </object>
        <object>
            <name>opacity</name>
            <pose>unknown</pose>
            <truncated>0</truncated>
            <difficult>0</difficult>
            <bndbox>
                <xmin>{xmin3}</xmin>
                <ymin>{ymin3}</ymin>
                <xmax>{xmax3}</xmax>
                <ymax>{ymax3}</ymax>
            </bndbox>
        </object>
    </annotation>"""
)

# XML Converter

In [19]:
# df_2box_short = df_2box[:1000]
# print(df_2box_short.head())

                              patientId  x_min1  y_min1  x_max1  y_max1  \
0  035789b1-3736-405d-9910-f8f23c62ae9f      60      88      90     111   
2  03ae75b6-45a7-4a68-9871-f07b82b17bf6      52      99      98     222   
3  03cd7a5b-d5d7-40a1-81b1-c4264920530a      71      42     125     185   
5  03e9a70f-3de8-4e13-b3f2-9dd6d75f496d      58      98      88     158   
6  03edb5ed-9e76-4abe-bc35-7bc95fea7e6a      46     102      93     179   

   x_min2  y_min2  x_max2  y_max2  
0     173      84     208     132  
2     146      92     206     208  
3     167      31     230     218  
5     171      81     204     141  
6     158     118     180     217  


In [22]:
# Build one str.format() mapping per single-box row: the patientId becomes the
# file name and each coordinate is stored as a string.
# A comprehension is used so that no loop variable is left behind at notebook
# scope; in particular nothing is bound to the name `dict`, which would shadow
# the builtin for every later cell.
list_dict_1box = [
    {
        'filename': row['patientId'],
        'xmin1': str(row['x_min1']),
        'ymin1': str(row['y_min1']),
        'xmax1': str(row['x_max1']),
        'ymax1': str(row['y_max1']),
    }
    for _, row in df_1box.iterrows()
]

In [23]:
# Build one str.format() mapping per two-box row: the patientId becomes the
# file name and each coordinate is stored as a string.
# A comprehension is used so that no loop variable is left behind at notebook
# scope; in particular nothing is bound to the name `dict`, which would shadow
# the builtin for every later cell.
# Coordinates go through int() first so a float-typed column (the box-2
# columns are read from the CSV as float) yields "173" rather than "173.0".
list_dict_2box = [
    {
        'filename': row['patientId'],
        'xmin1': str(int(row['x_min1'])),
        'ymin1': str(int(row['y_min1'])),
        'xmax1': str(int(row['x_max1'])),
        'ymax1': str(int(row['y_max1'])),
        'xmin2': str(int(row['x_min2'])),
        'ymin2': str(int(row['y_min2'])),
        'xmax2': str(int(row['x_max2'])),
        'ymax2': str(int(row['y_max2'])),
    }
    for _, row in df_2box.iterrows()
]

In [24]:
# Build one str.format() mapping per three-box row: the patientId becomes the
# file name and each coordinate is stored as a string.
# A comprehension is used so that no loop variable is left behind at notebook
# scope; in particular nothing is bound to the name `dict`, which would shadow
# the builtin for every later cell.
# Coordinates go through int() first so a float-typed column (the box-2 and
# box-3 columns are read from the CSV as float) yields "153" rather than
# "153.0".
list_dict_3box = [
    {
        'filename': row['patientId'],
        'xmin1': str(int(row['x_min1'])),
        'ymin1': str(int(row['y_min1'])),
        'xmax1': str(int(row['x_max1'])),
        'ymax1': str(int(row['y_max1'])),
        'xmin2': str(int(row['x_min2'])),
        'ymin2': str(int(row['y_min2'])),
        'xmax2': str(int(row['x_max2'])),
        'ymax2': str(int(row['y_max2'])),
        'xmin3': str(int(row['x_min3'])),
        'ymin3': str(int(row['y_min3'])),
        'xmax3': str(int(row['x_max3'])),
        'ymax3': str(int(row['y_max3'])),
    }
    for _, row in df_3box.iterrows()
]

In [25]:
# Spot check: show the first mapping built for each box count.
for records in (list_dict_1box, list_dict_2box, list_dict_3box):
    print(records[0])

{'filename': '03d92597-3e33-4fdf-8db5-a27cf5b8d3eb', 'xmin1': '148', 'ymin1': '60', 'xmax1': '194', 'ymax1': '144'}
{'filename': '035789b1-3736-405d-9910-f8f23c62ae9f', 'xmin1': '60', 'ymin1': '88', 'xmax1': '90', 'ymax1': '111', 'xmin2': '173', 'ymin2': '84', 'xmax2': '208', 'ymax2': '132'}
{'filename': '03a9498c-549d-4e7d-800b-e74797f7f625', 'xmin1': '49', 'ymin1': '128', 'xmax1': '103', 'ymax1': '195', 'xmin2': '152', 'ymin2': '150', 'xmax2': '219', 'ymax2': '211', 'xmin3': '153', 'ymin3': '53', 'xmax3': '180', 'ymax3': '76'}


In [30]:
# Report how many mappings were built for each box count.
for records in (list_dict_1box, list_dict_2box, list_dict_3box):
    print(len(records))

2475
3036
105


# Output

In [31]:
# output all 1-box .xml files (one "<patientId>.xml" per entry, written to the
# current working directory)
for record in list_dict_1box:
    # Escape XML special characters in the values substituted into the
    # template; the file name below uses the raw, unescaped patientId.
    escaped = {k: escape(v) for k, v in record.items()}
    data = template_1box.format(**escaped)
    # The context manager closes each file as soon as it is written, instead
    # of leaving thousands of handles open until garbage collection.
    with open("{}.xml".format(record['filename']), "w") as xml_file:
        xml_file.write(data)

In [33]:
# output all 2-box .xml files (one "<patientId>.xml" per entry, written to the
# current working directory)
for record in list_dict_2box:
    # Escape XML special characters in the values substituted into the
    # template; the file name below uses the raw, unescaped patientId.
    escaped = {k: escape(v) for k, v in record.items()}
    data = template_2box.format(**escaped)
    # The context manager closes each file as soon as it is written, instead
    # of leaving thousands of handles open until garbage collection.
    with open("{}.xml".format(record['filename']), "w") as xml_file:
        xml_file.write(data)

In [34]:
# output all 3-box .xml files (one "<patientId>.xml" per entry, written to the
# current working directory)
for record in list_dict_3box:
    # Escape XML special characters in the values substituted into the
    # template; the file name below uses the raw, unescaped patientId.
    escaped = {k: escape(v) for k, v in record.items()}
    data = template_3box.format(**escaped)
    # The context manager closes each file as soon as it is written, instead
    # of leaving open handles behind until garbage collection.
    with open("{}.xml".format(record['filename']), "w") as xml_file:
        xml_file.write(data)