# Extract metadata from AlertWildfire imagery

This script loops over images scraped from the AlertWildfire website, extracting metadata from the embedded watermark. It adds a record to the `AlertWildfire.Metadata` table for each image, containing information about which station it's from, its orientation (azimuth/tilt), and its date/time stamp.

> Note: Scraping is done by the `WebScraper/scrape.py` script, which dumps raw images into an S3 bucket.
> This script operates on those images by mounting the bucket to a local directory with s3fs.
> A more efficient approach would be to scrape images to the local filesystem and move them to S3 only after their metadata has been parsed.
> The current approach works fine for a single image stream; scaling the pipeline up would require reworking this step for automation anyway.

### First: import packages, and open connection to database

In [1]:
import glob, json, os, random, re
import numpy as np
from scipy import stats, signal
from datetime import datetime
from PIL import Image
import pytesseract
import pymysql
import sqlalchemy as SQL
from urllib.parse import quote_plus as QP
import matplotlib.pyplot as plt
import seaborn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to subsequent plots.
seaborn.set_theme()

# Current user's home directory; the secrets file and the image data
# directory are both located relative to it.
HOME = os.path.expanduser('~')

In [2]:
def get_sql_url():
    '''
    Build the SQLAlchemy connection URL for the MySQL database.

    Connection settings are read from `~/Documents/sql_secrets.json`, which
    is expected to hold a JSON object with the keys PASSWD, USER, HOST, PORT
    and DB.

    Returns a string of the form
    `mysql+pymysql://user:password@host:port/db`. Both the user name and the
    password are URL-escaped, so reserved characters such as '@', ':' or '/'
    in either one cannot corrupt the URL.

    Raises FileNotFoundError if the secrets file does not exist and KeyError
    if one of the required keys is missing.
    '''
    sql_secrets_path = os.path.join(HOME, 'Documents', 'sql_secrets.json')
    with open(sql_secrets_path, 'r') as secrets_file:
        secrets = json.load(secrets_file)

    # Only the credentials are escaped: host, port and database name are
    # inserted verbatim.
    user = QP(str(secrets['USER']))
    passwd = QP(str(secrets['PASSWD']))
    host = secrets['HOST']
    port = secrets['PORT']
    db = secrets['DB']

    return f"mysql+pymysql://{user}:{passwd}@{host}:{port}/{db}"

# Build the connection URL once and create the engine used by the database
# helper functions in this notebook.
SQL_URL = get_sql_url()
#print(SQL_URL)  # debugging only: the URL contains the database password
SQL_ENGINE = SQL.create_engine(SQL_URL)

In [3]:
# Load the `sql` IPython extension and point it at the same connection URL,
# so that %sql / %%sql magics can be used for ad-hoc queries.
%load_ext sql
%sql $SQL_URL

### Create image processing pipeline

We're going to need to extract metadata from images using OCR, so let's set up some helper functions

In [4]:
def get_watermark_region(img):
    '''
    Crop full image down to just the black/white rectangle in the lower left.

    `img` is assumed to be a single-channel (grayscale) PIL image, which is
    what the caller passes after `img.convert('L')`.

    Returns a cropped grayscale image 26 pixels high, spanning from the left
    edge of the image to the detected right edge of the watermark box.
    '''
    # Clip out the watermark line (full img width)
    W, H = img.size
    row_segment = img.crop((0, H-28, W, H-2))

    # Given a wide image segment with white text over black background on the
    # left side, crop out anything beyond the right edge of the black box.
    pix = np.asarray(row_segment)

    # Most common pixel value of every column. np.ravel flattens the result to
    # one value per column regardless of whether scipy returns it with shape
    # (1, W) (legacy behaviour) or (W,) (keepdims=False, the default in newer
    # SciPy releases). Indexing with `.mode[0]` only works for the former and
    # collapses the latter to a single scalar.
    mode = np.ravel(stats.mode(pix, axis=0).mode)

    # Columns whose dominant value is neither near-black nor near-white are
    # treated as lying outside the watermark box.
    M = 1.0 * ((mode > 5) & (mode < 200))

    # Smooth the per-column flags with a 30-column moving average so that
    # isolated columns inside the text do not end the box prematurely.
    win = np.zeros(M.shape)
    win[win.size//2-15:win.size//2+15] = 1
    c = signal.convolve(M, win/win.sum(), mode='same')
    c = 1.0 * (c > 0.5)

    # Force a hit in the last columns so the search below always finds an
    # edge, even when the box appears to span the whole row.
    c[-10:] = 1.0
    N = np.where(c == 1.0)[0].min()
    row_h = row_segment.size[1]

    watermark = row_segment.crop((0, 0, N, row_h))
    return watermark

def _try_strptime(text, fmt):
    '''
    Parse `text` with `datetime.strptime`, returning None instead of raising
    when the text is not a valid date/time for `fmt`.
    '''
    try:
        return datetime.strptime(text, fmt)
    except ValueError:
        return None


def extract_metadata(fname):
    '''
    Wrapper function to process a single image.

    On success, returns a dictionary with the following values:
    {
      station: str,
      timestamp: datetime,
      loc: tuple(float, float, float)
    }

    If any of the three fields cannot be recovered, the watermark is plotted
    for visual inspection and the cleaned-up OCR text (a str) is returned
    instead, so callers must check the type of the result.

    The timestamp is taken from the watermark text; if it is missing there,
    or is not a valid date/time, the timestamp embedded in `fname`
    (YYYY-MM-DDTHH:MM:SS) is used instead.
    '''
    with Image.open(fname) as img:
        # Convert to grayscale
        img = img.convert('L')

        # Extract the watermark segment of the image
        watermark = get_watermark_region(img)

        # Read the watermark text using OCR. Spaces are removed and the text
        # is lower-cased so the patterns below do not depend on how the OCR
        # engine spaced or capitalised the text.
        stamp = pytesseract.image_to_string(watermark)
        stamp = stamp.strip().replace(' ', '').lower()

        ret = dict()

        # Station name: the text between "axis-" and either "x:" or "elev".
        # Both patterns are tried in this order and a later match overrides
        # an earlier one.
        for pattern in (r'axis-(.*)\s*x:', r'axis-(.*)\s*elev'):
            stmatch = re.search(pattern, stamp)
            if stmatch:
                ret['station'] = stmatch.group(1).title()

        # Camera X/Y/Z values
        xyzmatch = re.search(r'x:(.*)y:(.*)z:([0-9\.-]*).*or', stamp)
        if xyzmatch:
            try:
                # Adding and then subtracting 1.0 turns a parsed -0.0 into 0.0
                X, Y, Z = [((float(v)+1.0)-1.0) for v in xyzmatch.groups()]
            except ValueError:
                # The OCR text matched the layout but the captured values are
                # not numbers. Leave 'loc' unset so the image is reported as
                # unparseable below instead of aborting the whole run.
                pass
            else:
                ret['loc'] = (X, Y, Z)

        # Timestamp from the watermark (date and time are adjacent because
        # spaces were stripped above)
        timestamp = None
        dtmatch = re.search(r'(\d\d\d\d/\d\d/\d\d\d\d:\d\d:\d\d)', stamp)
        if dtmatch:
            timestamp = _try_strptime(dtmatch.group(1), '%Y/%m/%d%H:%M:%S')

        # Fall back to the timestamp in the file name
        if timestamp is None:
            dtmatch = re.search(r'(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d)', fname)
            if dtmatch:
                timestamp = _try_strptime(dtmatch.group(1), '%Y-%m-%dT%H:%M:%S')

        if timestamp is not None:
            ret['timestamp'] = timestamp

        if all(key in ret for key in ('station', 'loc', 'timestamp')):
            return ret

        # Show what the OCR engine was given to make failures easy to debug
        plt.imshow(watermark, cmap=plt.cm.gray)
        plt.show()
        return stamp

### Now, define functions for processing images from the filesystem

In [5]:
# SQL statements used when processing images.

# Parameterized INSERT adding one image's metadata row to the Metadata table.
insert = SQL.text(
    "INSERT INTO Metadata "
    "(StationID, DateTime, X, Y, Z, Path) "
    "VALUES (:stationid, :timestamp, :x, :y, :z, :path);"
)
# Declare the type of every bound parameter explicitly.
insert = insert.bindparams(
    SQL.bindparam("stationid", type_=SQL.Integer),
    SQL.bindparam("timestamp", type_=SQL.DateTime),
    SQL.bindparam("x", type_=SQL.Float),
    SQL.bindparam("y", type_=SQL.Float),
    SQL.bindparam("z", type_=SQL.Float),
    SQL.bindparam("path", type_=SQL.String),
)

def query_existing_records():
    '''
    Load the lookup data needed while processing images, using a single
    database connection.

    Returns a tuple `(stations, db_image_paths)`:
      stations: dict mapping the second column of each `Stations` row to its
        first column (presumably station name -> station ID; verify against
        the table schema)
      db_image_paths: list with the `Path` value of every `Metadata` row

    Fetching everything up front avoids a separate query per image later on.
    '''
    station_query = SQL.text("SELECT * FROM Stations;")
    path_query = SQL.text("SELECT Path from Metadata;")

    stations = {}
    db_image_paths = []
    with SQL_ENGINE.connect() as conn:
        # Columns are addressed by position because the query selects `*`.
        for row in conn.execute(station_query):
            stations[row[1]] = row[0]

        for row in conn.execute(path_query):
            db_image_paths.append(row[0])

    return (stations, db_image_paths)

def process_images(data_dir, stations, db_image_paths):
    '''
    Loop through all images in the data path, parse each one's metadata,
    then add it to the Metadata table.

    Parameters:
      data_dir: directory searched for images matching `<data_dir>/*/*.png`
      stations: dict mapping station name -> station ID
      db_image_paths: iterable of image file names (basenames) that already
        have a Metadata record; those images are skipped

    Images whose metadata cannot be parsed, or whose station is not present
    in `stations`, are reported and counted as errors, and no record is
    inserted for them. Progress and a final summary are printed to stdout.
    '''
    # Every image on disk is tested against the existing records, so build a
    # set once: testing against a list makes this filter quadratic.
    known_paths = set(db_image_paths)

    img_paths_glob = os.path.join(data_dir, '*', '*.png')
    all_img_paths = glob.glob(img_paths_glob)
    new_img_paths = sorted(
        P for P in all_img_paths if os.path.basename(P) not in known_paths
    )

    n_total = len(all_img_paths)
    n_new = len(new_img_paths)
    n_dup = n_total - n_new
    n_added, n_processed, n_err = (0, 0, 0)

    # NOTE(review): the `autocommit` execution option is a legacy SQLAlchemy
    # feature (deprecated in 1.4, removed in 2.0) -- confirm that inserts are
    # still committed with the installed version.
    with SQL_ENGINE.connect().execution_options(autocommit=True) as conn:
        for fname in new_img_paths:
            n_processed += 1
            completed = f"{n_processed / n_new * 100:5.1f}%"
            print(completed, end='\r')

            basename = os.path.basename(fname)

            # Extract metadata from image. A str (the raw OCR text) signals
            # that parsing failed.
            mdata = extract_metadata(fname)

            if mdata is None or isinstance(mdata, str):
                n_err += 1
                print(("Unable to parse metadata from image: "+basename).ljust(100))
                print(mdata)
                continue

            if mdata['station'] not in stations:
                n_err += 1
                print(("Cannot find station: "+mdata['station']).ljust(100))
                continue

            # Add record
            X, Y, Z = mdata['loc']
            datarow = dict(stationid=stations[mdata['station']],
                           timestamp=mdata['timestamp'],
                           x=X, y=Y, z=Z, path=basename)

            conn.execute(insert, datarow)
            n_added += 1

            # Print status update (padded so it overwrites the previous line)
            status = f"{completed} {mdata['timestamp']}, {mdata['loc']}, {basename}"
            print(status.ljust(100), end='\r')

    print(f"\n\n\tAdded {n_added} records")
    print(f"\tSkipped {n_dup} existing records")
    print(f"\tUnable to process {n_err} images")

### Bringing it all together:

In [6]:
# Root directory of the scraped images; images are expected one directory
# level below it (`<DATA_DIR>/*/*.png`).
# NOTE(review): presumably the s3fs mount point described in the
# introduction -- confirm.
DATA_DIR = os.path.join(HOME, 'Data', 'Storage', 'AlertWF')

# Load the station lookup and the list of already-recorded images, then add
# a Metadata record for every image that is not in the database yet.
stations, db_image_paths = query_existing_records()
process_images(DATA_DIR, stations, db_image_paths)

100.0% 2021-03-13 11:19:58, (-120.0, -3.0, 2.0), Brightwood_2021-03-13T11:20:18.685063.png          png

	Added 3424 records
	Skipped 28664 existing records
	Unable to process 0 images


In [7]:
# import vaex
# with open('/home/jmp/foobar.csv', 'w') as csvfile:
#     csvfile.write('"foo","bar","baz"\n')
#     csvfile.write('1,2,3\n')
#     csvfile.write('4,5,6\n')
#     csvfile.write('7,8,9\n')

# vcsv = vaex.from_csv('/home/jmp/foobar.csv', chunk_size=5_000_000)
# for i, d in enumerate(vcsv):
#     fname = f"/home/jmp/Documents/AlertWildfire/foobar{i}.hdf5"
#     if os.path.exists(fname):
#         os.remove(fname)
#     d.export_hdf5(fname)

# dv = vaex.open("/home/jmp/Documents/AlertWildfire/foobar0.hdf5")
# print(dv)