## Overview
This notebook reassembles Pascal-VOC XML annotations that were made on images that had been split into 9 tiles.  The boxes from all tiles of an image are compiled into one annotation file per original image, but no attempt is made to weed out possible duplicates where parts of a single animal appear at the edges of two adjacent tiles. 

I gave up on reassembling the images themselves because the 9 tiles are not the same size, which makes reassembly difficult.  

In [1]:
!hostname

spearhead4


## Imports

In [2]:
#Make the parent of the working directory importable (needed for the trident_project import).
#Safe to re-run: the directory is only added if it is not already on sys.path.
#Insert at position 1, because 0 is the script path (or '' in REPL)
import sys
from pathlib import Path
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(1,project_root)
print(sys.path)

['/home/egdod/tanzania_dev/VM', '/home/egdod/tanzania_dev', '/anaconda/envs/trident/lib/python38.zip', '/anaconda/envs/trident/lib/python3.8', '/anaconda/envs/trident/lib/python3.8/lib-dynload', '', '/anaconda/envs/trident/lib/python3.8/site-packages', '/anaconda/envs/trident/lib/python3.8/site-packages/IPython/extensions', '/home/egdod/.ipython']


In [120]:
from pathlib import Path
from lxml import etree as et
from PIL import Image
from torchvision.transforms.functional import pad as tvpad
import torch
import random
import numpy as np
import re #regex
import subprocess
import pandas as pd
import zipfile
import shutil
import os
#import pyvips
import copy
from tqdm import tqdm

In [4]:
from trident_project.dev_packages.pascal_voc_writer.pascal_voc_writer import Writer

## Set paths

In [6]:
#Set main paths
annotation_source_path = Path('/cdata/tanzania/annotated_images/AIAIA/pvoc') #Where original zipped annotation files are

## Unzip MXJ annotation files and dump to flat directory



### Rename '.zip.zip' to '.zip'

In [None]:
#Every zip archive in the source folder, kept as plain strings
annotation_files = [str(entry) for entry in annotation_source_path.iterdir() if entry.suffix == '.zip']

#Collapse a doubled '.zip.zip' extension into a single '.zip' by renaming the file on disk
for zip_name in annotation_files:
    if not zip_name.endswith('.zip.zip'):
        continue
    doubled = Path(zip_name)
    doubled.rename(str(doubled.with_suffix('').with_suffix('.zip')))

In [7]:
#Get Pascal-VOC zipped annotation files from annotation_source_path
#Note: the list has to be remade after renaming .zip.zip files, and here we just take tiled annotations (MXJ)
annotation_files = [str(x) for x in annotation_source_path.iterdir() if x.suffix == '.zip' and 'mxj2019' in str(x)] #full paths kept as strings; only archives whose path contains 'mxj2019'
annotation_files

['/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019CB_b-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019FB_b-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ia-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ga-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ja-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019DA_a-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019CA_a-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Aa-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019EA_a-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019LA-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ha-2020_08_26_08_49_31-pvoc.zip',
 '/cdata/tanzania/annotated_image

In [10]:
#MODIFIED from original: Unzips a single annotation zipfile into a temporary directory.  IGNORES relative filepaths.
#Takes only .xml files.  Does not clean the output directory first (allows accumulation from multiple calls).
def unzip_ann_file(ann_file,tempdir):
    """Extracts the .xml files of one zipped annotation archive into a flat directory using 7z.
    The output directory is not cleaned first, so repeated calls accumulate files.
    Parameters:
    ann_file: string or pathlib Path. Zip archive to extract from.
    tempdir: string or pathlib Path. Directory that receives the extracted .xml files.
    Prints the command that was run, plus a warning if 7z reports a non-zero exit status.
    """
    #Warning: the -o flag can’t have spaces between it and the filename, and can only have ONE space before it
    outdir = '-o' + str(tempdir)
    #'e' extracts WITHOUT the archive's internal directory structure, so all files land directly in tempdir.
    #NOTE(review): identically named .xml files from different archives would collide in tempdir -- confirm names are unique.
    returncode = subprocess.call(["7z","e",str(ann_file),outdir,"*/*.xml","-r"])
    print("7z e ",ann_file,outdir,"*/*.xml","-r")
    #Do not fail silently: report a non-zero exit status but keep going (best effort, as before)
    if returncode != 0:
        print("Warning: 7z exited with status",returncode,"for",ann_file)

In [11]:
#Flat scratch directory that receives the extracted tile annotations (same location as mxj_ann_dir used later)
tempdir = '/cdata/tanzania/temp/annotations/MXJ-2019'
#Extract the .xml files of every archive into tempdir; the files accumulate across archives
for f in annotation_files:
    unzip_ann_file(f,tempdir)

7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019CB_b-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019FB_b-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ia-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ga-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019Ja-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019DA_a-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-2019 */*.xml -r
7z e  /cdata/tanzania/annotated_images/AIAIA/pvoc/mxj2019CA_a-2020_08_26_08_49_31-pvoc.zip -o/cdata/tanzania/temp/annotations/MXJ-

In [22]:
#Count annotation files
!ls '/cdata/tanzania/temp/annotations/MXJ-2019' | wc -l

608


## Bring in csv file of checked images
This is needed for figuring out which images are missing annotations.  Howard sent the list of checked images.

In [12]:
processed_images = pd.read_csv('/cdata/tanzania/MXJ_annotated.csv')

In [13]:
processed_images.columns

Index(['Unnamed: 0', 'SourceFile', 'FileName', 'FileSize', 'Model',
       'DateTimeOriginal', 'ImageSize', 'Quality', 'FocalLength',
       'ShutterSpeed', 'Aperture', 'ISO', 'WhiteBalance', 'Flash', 'base',
       'side', 'ImageID', 'geom', 'select'],
      dtype='object')

In [14]:
processed_images['SourceFile'][:5]

1    mxj-L_20190327090234.jpg
2    mxj-L_20190327090236.jpg
3    mxj-L_20190327090238.jpg
4    mxj-L_20190327090240.jpg
5    mxj-L_20190327090242.jpg
Name: SourceFile, dtype: object

In [18]:
mxj_images = list(processed_images['SourceFile'])
type(mxj_images),len(mxj_images)

(list, 14180)

In [24]:
#Remove the suffix so we can match with xml files
mxj_images = [str(Path(f).with_suffix('')) for f in mxj_images]
mxj_images[:5]

['mxj-L_20190327090234',
 'mxj-L_20190327090236',
 'mxj-L_20190327090238',
 'mxj-L_20190327090240',
 'mxj-L_20190327090242']

In [23]:
#So: 4.3% of the files _could_ have a matching annotation, but since the annotation files represent 
#tiles (1-9), they are probably clumped and the real number could be as low as 1/9th of that.
608/14000

0.04342857142857143

## Get a list of annotation files

In [31]:
mxj_ann_dir = Path('/cdata/tanzania/temp/annotations/MXJ-2019')
ann_files = list(mxj_ann_dir.glob('*.xml'))
ann_files[:5]

[PosixPath('/cdata/tanzania/temp/annotations/MXJ-2019/mxj-L_20190328100158_2.xml'),
 PosixPath('/cdata/tanzania/temp/annotations/MXJ-2019/mxj-L_20190328110722_7.xml'),
 PosixPath('/cdata/tanzania/temp/annotations/MXJ-2019/mxj-L_20190328093142_7.xml'),
 PosixPath('/cdata/tanzania/temp/annotations/MXJ-2019/mxj-L_20190328081158_7.xml'),
 PosixPath('/cdata/tanzania/temp/annotations/MXJ-2019/mxj-L_20190328111252_8.xml')]

In [65]:
# If every one of the 9 image tiles had animals in it, there would be 9 annotation 
# files, but in reality there are often fewer.
def get_annotation_group(ann_path,fname):
    """Collects the annotation files (at most 9 are expected, one per tile) belonging to one image.
    Parameters:
    ann_path: pathlib Path. Directory to search for annotation files.
    fname: string. filename to search for, without suffix.
    Returns a list of strings: the path of every entry in ann_path whose name contains fname.
    """
    #Wildcards on both sides: any entry whose name contains fname is a match
    return [str(match) for match in ann_path.glob('*' + fname + '*')]

### Check to see how many images have annotations

In [51]:
#Test
def count_matching_annfiles(ann_path,img_list):
    """Counts the image names that have at least one annotation file in ann_path.
    Parameters:
    ann_path: pathlib Path. Directory holding the annotation files.
    img_list: list of strings. Image filenames without suffix.
    Prints the first 9 matching names as a sanity check and returns the number of matches.
    """
    matched = 0
    for stem in img_list:
        if not get_annotation_group(ann_path,stem):
            continue
        matched += 1
        #show a handful of examples for debugging
        if matched < 10:
            print(stem,'')
    return matched

In [52]:
#The 9 files listed (the first matches found) all contain animals
count_matching_annfiles(mxj_ann_dir,mxj_images)

mxj-L_20190328074652 
mxj-L_20190328074654 
mxj-L_20190328074754 
mxj-L_20190328074758 
mxj-L_20190328074816 
mxj-L_20190328075042 
mxj-L_20190328075134 
mxj-L_20190328075136 
mxj-L_20190328075140 


316

In [38]:
316/14000 # 2.2% of the images have an associated annotation

0.022571428571428572

## Define some functions

In [42]:
def get_bbox_offset(tpos, tw, th):
    """Returns an offset as a tuple (width, height) in pixels for a 9-tile (3 x 3) system.
    The offset is the position of the tile's top-left corner in the full image, assuming
    that every tile is tw wide and th high.
    Parameters:
    tpos: int. tile position: 0-8 reading 3 rows from top to bottom and left to right.
    tw: int. Tile width in pixels
    th: int. Tile height in pixels.
    Raises ValueError if tpos is not a whole number between 0 and 8."""
    if tpos not in range(9):
        raise ValueError("Expected a tile position between 0 and 8")
    #Row-major numbering: the quotient is the row index, the remainder the column index
    row, col = divmod(int(tpos), 3)
    return (col * tw, row * th)

In [43]:
def adjust_bbox(o,x_adj,y_adj):
    """Shifts the bounding box stored under an 'object' node by (x_adj, y_adj).
    'o' is expected to answer the xpath queries bndbox/xmin, bndbox/xmax, bndbox/ymin and
    bndbox/ymax (an lxml tree rooted in 'object').  Coordinates are read as numbers,
    truncated to int, shifted, and written back as text.
    Parameters:
    x_adj, y_adj: int. Adjustment in x and y, respectively.
    WARNING: modifies the tree object, 'o', in place; the same object is returned."""
    tags = ('xmin', 'xmax', 'ymin', 'ymax')
    #Look up and parse everything first, so nothing is modified if a coordinate is missing or malformed
    nodes = {tag: o.xpath('bndbox/' + tag) for tag in tags}
    values = {tag: int(float(nodes[tag][0].text)) for tag in tags}
    for tag in tags:
        shift = x_adj if tag.startswith('x') else y_adj
        nodes[tag][0].text = str(values[tag] + shift)
    return o

In [117]:
#Uses pascal_voc_writer to create a new empty annotation file for each image in a list.
# missing_ann is expected to be a list of full filepaths without extensions.  
def write_empty_annotations(imagepath,missing_ann,an_dest):
    """Creates a Pascal-VOC style XML annotation file for each file in a list of files that are assumed 
    to not include any objects of interest.  Uses pascal_voc_writer.
    Parameters:
    imagepath: string. Folder where images that match the annotations are found
    missing_ann: list of strings.  Filepaths without suffix.  This is a list of annotation files to be made.
    an_dest: string. Folder for output.
    Returns a list of 'bad' files (the .xml output paths for the subset of missing_ann for which no
    matching .jpg image could be found).  Prints a warning if that list is not empty.
    """
    badlist = []
    for filestem in missing_ann:
        image = (Path(imagepath)/filestem).with_suffix('.jpg')
        relpath = Path(filestem).with_suffix('.jpg')
        outfile = str((Path(an_dest)/filestem).with_suffix('.xml'))
        try:
            #Only the size is needed; the context manager closes the file handle right away
            with Image.open(image) as im:
                width, height = im.size
        except FileNotFoundError:
            #No image, so no annotation can be written: remember the file and move on
            badlist.append(outfile)
            continue
        writer = Writer(str(image.name), width, height) # Writer(path, width, height)
        writer.changePath(relpath)
        writer.changeFolder(str(image.parent))
        writer.save(outfile)
    if badlist:
        print("Warning: No annotation file written for ",len(badlist)," xml files with no match to images.")
    return badlist

In [99]:
def get_image_size(imagefile):
    """Opens an image file and returns the image size as tuple(width,height) or None.
    None is returned (and the error is printed) when the file does not exist.
    imagefile: string.  File to open."""
    try:
        #The context manager closes the file handle as soon as the size has been read
        with Image.open(imagefile) as im:
            width, height = im.size
    except FileNotFoundError as e:
        print(e)
        return None
    return (width,height)

In [127]:
def reunite_ninetiles(imagepath,imgfile,annotation_src,annotation_dest):
    """Find up to 9 XML annotation files for a tiled image file.  Adjust the bounding box
    coordinates in each tile to correspond to the box's position in the original image; 
    accumulate all objects and write a single XML annotation file corresponding to the 
    full-size (un-tiled) original image.  Edit the filename, folder, width, and height 
    elements to correspond to the original image.  Write an annotation file with no objects
    if there are no annotation files found for 'imgfile'.
    Parameters:
     - imagepath: pathlib Path. Folder containing 'imgfile'.
     - imgfile: string. Image filename without suffix or path
     - annotation_src: pathlib Path.  Folder where .xml annotation files are.
     - annotation_dest: pathlib Path Destination folder for output XML file.
    Returns the number of objects written to the combined annotation file (0 if no tile
    annotations were found).
    Raises:
     - ValueError if a matching file name does not end in '_<tile number>.xml', or if the
       tile number is outside 0-8 (raised by get_bbox_offset).
     - FileNotFoundError if the full-size image cannot be found.
    """
    ninefiles = get_annotation_group(annotation_src,imgfile) #There may be fewer than 9
    if not ninefiles:
        write_empty_annotations(imagepath,[imgfile],annotation_dest)
        return 0

    parser = et.XMLParser(remove_blank_text=True)
    master_tree = None
    for ann_file in ninefiles:
        tree = et.parse(str(ann_file),parser)

        #All tiles including the first may need to have coordinates adjusted.
        #The tile position is the number between the last '_' and the '.xml' suffix.
        tile_match = re.search(r'_([0-9]+)\.xml$', ann_file)
        if tile_match is None:
            raise ValueError("Cannot read a tile position from annotation file name: " + ann_file)
        tile_position = int(tile_match.group(1))
        #Retrieve tile height and width.
        #NOTE(review): the offset calculation assumes all 9 tiles have this same size, but the
        #notebook overview says the tiles are not the same size -- verify the offsets of
        #boxes in the right-hand and bottom tiles.
        tw = int(float(tree.xpath('/annotation/size/width/text()')[0]))
        th = int(float(tree.xpath('/annotation/size/height/text()')[0]))
        #Calculate the bbox adjustment, based on the tile position
        (x_adj,y_adj) = get_bbox_offset(tile_position, tw, th)
        #Accumulate object data.  Adjust the text in the bbox coordinates but leave all other XML unchanged.
        objects = tree.xpath('//object')
        for o in objects:
            adjust_bbox(o,x_adj,y_adj) #Warning: modifies the tree object

        if master_tree is None:
            #Use the XML tree of the first tile found as the template for the image settings.
            #<filename> names the full-size image (as in write_empty_annotations), not the XML file.
            tree.xpath('/annotation/filename')[0].text = str(Path(imgfile).with_suffix('.jpg'))
            tree.xpath('/annotation/folder')[0].text = 'MXJ-2019'
            #Set the image width and height from the full-size original image
            img = str(imagepath/Path(imgfile).with_suffix('.jpg'))
            size = get_image_size(img)
            if size is None:
                raise FileNotFoundError("Full-size image not found for annotations: " + img)
            (width, height) = size
            tree.xpath('/annotation/size/width')[0].text = str(width)
            tree.xpath('/annotation/size/height')[0].text = str(height)
            master_tree = copy.deepcopy(tree)
        else:
            #Append <object> onto the last position of <annotation>.
            ann_element = master_tree.xpath('/annotation')[0]
            for o in objects:
                ann_element.append(o)

    outpath = str(annotation_dest/Path(imgfile).with_suffix('.xml'))
    master_tree.write(outpath, pretty_print=True)
    return len(master_tree.xpath('//object'))

# Call the function

In [104]:
#Inputs and output location for the reassembly run
imagepath = Path('/cdata/tanzania/annotated_images/MXJ-2019') #folder with the full-size .jpg images
imgfile_list = mxj_images #image names without suffix, taken from the csv of checked images
annotation_src = mxj_ann_dir #folder holding the extracted per-tile .xml files
annotation_dest = Path('/cdata/tanzania/temp/mxj_output') #output folder -- presumably must already exist; TODO confirm

In [128]:
#Duplicate names would make a later image silently overwrite the output of an earlier one
assert (len(imgfile_list)==len(set(imgfile_list))),'Imagefile list must not contain duplicate names'
#One combined annotation file per image; keep the number of objects written for each
object_count = []
for imgfile in tqdm(imgfile_list):
    object_count.append(reunite_ninetiles(imagepath,imgfile,annotation_src,annotation_dest))

100%|██████████| 14180/14180 [03:11<00:00, 74.00it/s]


In [130]:
len(object_count)

14180