In [62]:
# OCR from Google Vision API
from google.cloud import vision

# Image processing
from rasterio.enums import Resampling
from PIL import Image, ImageFilter
from rasterio.mask import mask
from geotiff import GeoTiff
import rasterio
import cv2

# Data processing
import pandas as pd
import numpy as np

# GeoPandas
from shapely.geometry import Polygon, Point
from shapely.ops import cascaded_union
from shapely.ops import unary_union
from shapely import geometry
import geopandas as gpd

# Manipulation of files in disk
import glob
import sys
import io
import os

# Other utilities
from imutils.object_detection import non_max_suppression
import time
import re

# Model v1: Split the image, detect text areas, and convert to text strings

## Splitting the image

In [63]:
# Takes a Rasterio dataset and splits it into squares of dimensions squareDim * squareDim
def splitImageIntoCells(img, filename, num_imgs=3):
    squareDim_wide = img.shape[1] // num_imgs
    squareDim_height = img.shape[0] // num_imgs
    
    numberOfCellsWide = img.shape[1] // squareDim_wide
    numberOfCellsHigh = img.shape[0] // squareDim_height
    x, y = 0, 0
    count = 0
    for hc in range(numberOfCellsHigh):
        y = hc * squareDim_height
        for wc in range(numberOfCellsWide):
            x = wc * squareDim_wide
            geom = getTileGeom(img.transform, x, y, squareDim_wide, squareDim_height)
            getCellFromGeom(img, geom, filename, count)
            count = count + 1

# Generate a bounding box from the pixel-wise coordinates using the original datasets transform property
def getTileGeom(transform, x, y, squareDim_wide, squareDim_height):
    corner1 = (x, y) * transform
    corner2 = (x + squareDim_wide, y + squareDim_height) * transform
    return geometry.box(corner1[0], corner1[1],
                        corner2[0], corner2[1])

# Crop the dataset using the generated box and write it out as a GeoTIFF
def getCellFromGeom(img, geom, filename, count):
    crop, cropTransform = mask(img, [geom], crop=True)
    writeImageAsGeoTIFF(crop,
                        cropTransform,
                        img.meta,
                        img.crs,
                        filename+"_"+str(count))

# Write the passed in dataset as a GeoTIFF
def writeImageAsGeoTIFF(img, transform, metadata, crs, filename):
    metadata.update({"driver":"GTiff",
                     "height":img.shape[1],
                     "width":img.shape[2],
                     "transform": transform,
                     "crs": 'EPSG:4686'})
    with rasterio.open(filename+".tif", "w", **metadata) as dest:
        dest.write(img)

In [64]:
def detect_text(bytes_img):
    """
    Uses the Google Vision API to extract text from
    an image.
    
    Arguments
    ---------
    file_path: str
               path of the image to process.
    
    Outputs
    -------
    response: AnnotateImageResponse object
              json like format with bounding box and other
              relevant information.
    text: str
          text extracted from the image.
    """
    client = vision.ImageAnnotatorClient()    
    image = vision.Image(content=bytes_img)
    response = client.document_text_detection(image=image)
    text = response.full_text_annotation.text
    
    return response, text


def getbytesimg_from_path(filepath):
    """
    Obtains the bytes base64 format of an image from a 
    local file path.
    
    Arguments
    ---------
    filepath: str
              Path of the image file to convert
    
    Output
    ------
    bytes_img: bytes
               base64 format of the image
    """
    with open(filepath, "rb") as image_file:
        bytes_img = image_file.read()
    
    return bytes_img

In [65]:
def split_images(filepath, num_imgs=3):
    """
    Split a large image into a grid of 3x3
    smaller images.
    
    Arguments:
    ---------
    filepath: str
              file path of the large image
    num_imgs: int (optional)
              Number of rows and columns of the grid
              
    Output
    ------
    None
    
    """
    img = rasterio.open(filepath)
    splitImageIntoCells(img, "split_out/output_data")

In [66]:
def geotif_to_jpeg(tif_filename):
    """
    Converts geotif image from disk and saves it 
    in the same folder in jpeg format.
    
    Arguments
    ---------
    tif_filename: str
                  path of the tif file to convert
    
    Output
    ------
    None
    
    """
    with rasterio.open(tif_filename) as infile:    
        profile = infile.profile    
        profile['driver']='JPEG'
        jpeg_filename = tif_filename.replace(".tif", ".jpeg")
        with rasterio.open(jpeg_filename, 'w', **profile) as dst:
            dst.write(infile.read())

In [67]:
def get_text_coords(response):
    """
    Extracts the geographic names and bounding boxes coordinates
    from an AnnotateImageResponse object (Google).
    
    Arguments
    ---------
    response: AnnotateImageResponse (Google Vision API)
              Response object after calling the document_text_detection
              function
              
    Outputs
    -------
    palabras_google: list
                     List of strings containing the geographic names
    boundings_google: list
                      List of lists of 4 vertices that define the bounding 
                      box of each geographic name
    confidence_google: list
                       List of floats representing the confidence of the 
                       Google API at detecting each handwritten text
    """
    
    palabras_google = []
    boundings_google = []
    confidence_google = []
    
    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            palabra_google = ''
            boundings_google.append(block.bounding_box)
            confidence_google.append(block.confidence)
            for parrafos in block.paragraphs:
                for palabras in parrafos.words:
                    for simbolo in palabras.symbols:
                        palabra_google = palabra_google+simbolo.text
                    palabra_google = palabra_google+' '
            palabras_google.append(palabra_google.rstrip())
            
    return palabras_google, boundings_google, confidence_google

In [68]:
def get_coords(boundings, word_index, point_index, axis):
    """
    Returns x or y coordinate from the list of vertices.
    
    Arguments
    ---------
    boundings: list
               List of bounding boxes with the vertices of 
               each bounding box
    word_index: int
                Index of the desired word starting at 0
    point_index: int
                 Index of the desired point (from 0 to 3)
    coord: str
           Either x or y
           
    Output
    ------
    coordinate: float
                Coordinate of the specified word, point, and axis
    
    """
    if axis == "x":
        coordinate = boundings[word_index].vertices[point_index].x
    else:
        coordinate = boundings[word_index].vertices[point_index].y
    
    return coordinate

In [69]:
def create_geometries(filepath):
    """
    Splits the image in a grid, uses Google Vision API to extract text
    and the bounding boxes. This method creates geojson files with 
    the geographic names, their respective geodesic coordinates, the 
    confidence, and the geometries of the bounding boxes.
    
    The geojson files are saved in the folder geometries.
    
    Arguments
    ---------
    filepath: str
              Path to the geotiff image file to process
              
    Output
    ------
    None
    """

    split_images(filepath)
    images_to_proces = os.listdir(path='split_out')

    for path in images_to_proces:
        if ('.tif' in path):
            geotif_to_jpeg("split_out/" + path)
            sub_image_jpeg = path.replace('.tif','.jpeg')
            response, text = detect_text(getbytesimg_from_path("split_out/" + sub_image_jpeg))
            words, boundings, confidence = get_text_coords(response)
            img = rasterio.open("split_out/" + sub_image_jpeg)

            # Generating the sub_geometries
            geometries = []
            centroids = []

            for i in range(len(words)):
                aux_polygon = Polygon([img.xy(get_coords(boundings,i,0,"y"),get_coords(boundings,i,0,"x")),
                                       img.xy(get_coords(boundings,i,1,"y"),get_coords(boundings,i,1,"x")),
                                       img.xy(get_coords(boundings,i,2,"y"),get_coords(boundings,i,2,"x")),
                                       img.xy(get_coords(boundings,i,3,"y"),get_coords(boundings,i,3,"x")),
                                       img.xy(get_coords(boundings,i,0,"y"),get_coords(boundings,i,0,"x"))])
                geometries.append(aux_polygon)
                centroids.append(aux_polygon.representative_point())

            sub_img_gdf = gpd.GeoDataFrame(columns=["toponimo_ocr","confidence",
                "centroide_longitud","centroide_latitud","geometry"], crs=str(img.crs))
            sub_img_gdf["geometry"] = geometries
            sub_img_gdf["toponimo_ocr"] = words
            sub_img_gdf["confidence"] = confidence
            sub_img_gdf["centroide_longitud"] = [x.coords[0][0] for x in centroids]
            sub_img_gdf["centroide_latitud"] = [x.coords[0][1] for x in centroids]
            name_geometry = 'geometries/'+sub_image_jpeg.replace('.jpeg','.geojson')
            try:
                sub_img_gdf.to_file(name_geometry, driver="GeoJSON")
            except:
                print('Empty text detected, no geometries generated!')

In [70]:
def combine_geometries(filepath):
    """
    Takes all the geojson files in the geometries folder and combine
    them into a single geojson file. It combines bounding boxes that 
    intersect each other and their respective geographic names. It 
    deletes the rows whose name has only numbers.
    
    Arguments
    ---------
    filepath: str
              Path of the original image
    
    Output
    ------
    all_toponyms_gdf: GeoDataFrame
                      GeoDataFrame with all the toponyms from the original
                      image. It contains the geographic name, geodesic 
                      coordinates of the centroid, confidence and the 
                      geometry of the bounding boxes.
    
    """
    org_img = rasterio.open(filepath)
    geometries_to_process = os.listdir(path='geometries')    
    rectangles = []
    all_toponyms_gdf = gpd.GeoDataFrame(columns=["toponimo_ocr","confidence",
                "centroide_longitud","centroide_latitud","geometry"],crs=str(org_img.crs))

    for geometry in geometries_to_process:
        if ".geojson" in geometry:
            file = gpd.read_file('geometries/' + geometry)
            all_toponyms_gdf = all_toponyms_gdf.append(file)

    all_toponyms_gdf.reset_index(drop=True, inplace=True)
    all_toponyms_gdf = geojson_posprocessing(all_toponyms_gdf)
    all_toponyms_gdf.to_file('text_detected.geojson',driver='GeoJSON')
    
    return all_toponyms_gdf

In [71]:
def get_image_corners(filepath):
    """
    Returns the geodesic coordinates of the original image.
    
    Arguments
    ---------
    filepath: str
              Path of the original image
    
    Output
    ------
    image_corners: dict
                   Dictionary with two keys: "upper_left" and "lower_left"
                   The values are tuples with the latitude and longitude
                   of the image corners.
    """
    original_img = rasterio.open(filepath)
    image_corners = {
        "upper_left": original_img.xy(0,0),
        "lower_right": original_img.xy(original_img.shape[0], original_img.shape[1])
    }
    
    return image_corners

In [72]:
def empty_folders():
    """
    Remove all temporary output files in the "split_out" and
    "geometries" folders. It also deletes the final output file
    detected_text.geojson.
    
    Arguments
    ---------
    None
    
    Output
    ------
    None
    """
    folders = ["split_out", "geometries"]
    
    for folder in folders:
        files = glob.glob(f"{folder}/*")
        for f in files:
            os.remove(f)
    try:
        os.remove("text_detected.geojson")
    except:
        print("text_detected.geojson already deleted!")

In [96]:
def geojson_posprocessing(geojson_toponyms):
    """
    Performs the following cleaning processes:
     - Identifies and combines polygons that intersect each other.
     - Combines the geographic names using the x coordinate 
    of the centroids to determine which name comes first. 
     - Remove punctuation symbols.
     - Delete the entries that do not have text
    
    Arguments
    ---------
    geojson_toponyms: gpd.GeoDataFrame
                      Initial GeoDataFrame union of all split images.
    
    Output: 
    ------
    new_geojson: GeoDataFrame
                 GeoDataFrame after processing the mentioned processing
    
    """
    
    new_geojson = gpd.GeoDataFrame(columns=["toponimo_ocr","confidence",
        "centroide_longitud","centroide_latitud","geometry"],
                                    crs=geojson_toponyms.crs)
    
    # Combine polygons that intersect each other
    while(len(geojson_toponyms) != 0):
        
        indexes = []
        new_elments =[]

        for i in range(0,len(geojson_toponyms)):
            if (geojson_toponyms.geometry[0].intersects(
                                geojson_toponyms.geometry[i])):
                indexes.append(i)

        if (len(indexes)==1):
            indexes = [0]
            new_geojson = new_geojson.append(geojson_toponyms.iloc[0:1])

        else:
            for j in range(0,4):
                for idx in range(len(indexes)):
                    for i in range(0,len(geojson_toponyms)):
                        if (geojson_toponyms.geometry[indexes[idx]].intersects(
                                                  geojson_toponyms.geometry[i])):
                            indexes.append(i)
                indexes = list(set(indexes))

            geom_list = [geojson_toponyms.geometry[j] for j in indexes]
            new_elments = [(geojson_toponyms['toponimo_ocr'][row],
                            geojson_toponyms['confidence'][row],
                            geojson_toponyms['centroide_longitud'][row]) for row in indexes]

            new_elments.sort(key=lambda tup: tup[2], reverse=False)
            toponimo_ocr = [' '.join([i[0] for i in new_elments])]
            confidence = [max([i[1] for i in new_elments])]
            new_geom = [cascaded_union(geom_list)]
            centroide_longitud = [new_geom[0].representative_point().coords[0][0]]
            centroide_latitud = [new_geom[0].representative_point().coords[0][1]]
            new_word = {'toponimo_ocr': toponimo_ocr,'confidence': confidence, 
                        'centroide_longitud': centroide_longitud,
                        'centroide_latitud': centroide_latitud,'geometry': new_geom}

            aux_geojson = gpd.GeoDataFrame(new_word,columns=["toponimo_ocr",
                            "confidence", "centroide_longitud","centroide_latitud", 
                                              "geometry"], crs=geojson_toponyms.crs)
            
            new_geojson = new_geojson.append(aux_geojson)

        geojson_toponyms = geojson_toponyms.drop(
            labels=indexes, axis=0).reset_index(drop=True)      
           
    new_geojson.reset_index(drop=True,inplace=True)
    
    # Remove punctuation
    new_geojson["toponimo_ocr"] = new_geojson["toponimo_ocr"].apply(
        lambda x: re.sub(r"[^\w\s]", "", x))
    
    # Remove rows that have a sequence of 3+ numbers
    no_numbers = new_geojson["toponimo_ocr"].apply(
        lambda x: False if re.search("\d{2,}", x) else True)
    new_geojson = new_geojson.loc[no_numbers].reset_index(drop=True)
        
    return new_geojson

In [97]:
aerophotos = glob.glob("geotiffs/*.tif")
aerophotos

['geotiffs/M-1390 F-42286.tif',
 'geotiffs/M-1390 F-42290.tif',
 'geotiffs/C-1974 F-238.tif',
 'geotiffs/C-2070 F-252.tif',
 'geotiffs/C-2070 F-250.tif',
 'geotiffs/C-1974 F-240.tif']

## Suggested procedure

1. Delete all the temporary files (`empty_folders()`)
2. Create individual geometries from the path of the original image (`create_geometries(path)`)
3. Combine geometries into a single GeoJSON file (`combine_geometries(path)`)
4. Get coordinates of the corners of the original image (`get_image_corners(path)`)

### Testing for each of the 6 geotiff images

In [98]:
%%time
# M-1390 F-42286
empty_folders()
create_geometries(aerophotos[0])
all_toponyms_img1 = combine_geometries(aerophotos[0])
get_image_corners(aerophotos[0])

  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


Empty text detected, no geometries generated!
Empty text detected, no geometries generated!
CPU times: user 3.45 s, sys: 142 ms, total: 3.59 s
Wall time: 6.51 s


{'upper_left': (-75.81961718134531, 3.9789168373452815),
 'lower_right': (-75.70771187534531, 3.8674629443452813)}

In [100]:
%%time
# M-1390 F-42290
empty_folders()
create_geometries(aerophotos[1])
all_toponyms_img2 = combine_geometries(aerophotos[1])
get_image_corners(aerophotos[1])

  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


CPU times: user 3.69 s, sys: 213 ms, total: 3.9 s
Wall time: 7.77 s


{'upper_left': (-75.79279845128181, 4.112223605281884),
 'lower_right': (-75.69306024028181, 4.011420865281885)}

In [101]:
%%time
# C-1974 F-238.tif
empty_folders()
create_geometries(aerophotos[2])
all_toponyms_img3 = combine_geometries(aerophotos[2])
get_image_corners(aerophotos[2])

  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


Empty text detected, no geometries generated!
CPU times: user 3.07 s, sys: 162 ms, total: 3.23 s
Wall time: 6.57 s


{'upper_left': (-75.8740351937534, 3.8952368917533815),
 'lower_right': (-75.8089799487534, 3.8319784377533814)}

In [102]:
%%time
# C-2070 F-252.tif
empty_folders()
create_geometries(aerophotos[3])
all_toponyms_img4 = combine_geometries(aerophotos[3])
get_image_corners(aerophotos[3])

text_detected.geojson already deleted!


  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


CPU times: user 4.04 s, sys: 170 ms, total: 4.21 s
Wall time: 8.24 s


{'upper_left': (-75.82882085855019, 3.952130957550283),
 'lower_right': (-75.74488113955019, 3.867971405550283)}

In [103]:
%%time
# C-2070 F-250.tif
empty_folders()
create_geometries(aerophotos[4])
all_toponyms_img5 = combine_geometries(aerophotos[4])
get_image_corners(aerophotos[4])

  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


CPU times: user 4.09 s, sys: 171 ms, total: 4.27 s
Wall time: 7.97 s


{'upper_left': (-75.82953946582832, 4.001018766828459),
 'lower_right': (-75.74476330082832, 3.9190583278284588)}

In [104]:
%%time
# C-1974 F-240.tif
empty_folders()
create_geometries(aerophotos[5])
all_toponyms_img6 = combine_geometries(aerophotos[5])
get_image_corners(aerophotos[5])

  corner1 = (x, y) * transform
  corner2 = (x + squareDim_wide, y + squareDim_height) * transform


CPU times: user 3.39 s, sys: 156 ms, total: 3.55 s
Wall time: 7.5 s


{'upper_left': (-75.87331615888398, 3.9374634048839634),
 'lower_right': (-75.81190315188398, 3.8747698248839635)}