In [1]:
import boto3, glob, json, os, random, re, subprocess
import numpy as np
from scipy import stats, signal
from datetime import datetime
from PIL import Image
import pytesseract
import matplotlib.pyplot as plt
import seaborn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to subsequent plots.
seaborn.set_theme()

In [2]:
from tools.db_util import get_sql_engine, SQL
# Database engine built by the project helper in tools.db_util.
SQL_ENGINE = get_sql_engine()
# Load the `sql` IPython extension and connect it to the same database,
# so that %sql / %%sql cells and SQL_ENGINE talk to one server.
%load_ext sql
%sql $SQL_ENGINE.url

## Un-archive some data

The next cells were used to convert imagery that I previously captured using the brute-force Selenium scraping method into the same format as imagery from the more recent method (direct download). Namely, each image is converted to JPEG, a lower-resolution version is stored alongside the full-resolution one, and the timestamp in the filename is updated to reflect the time of image capture rather than the time of local acquisition.

This only needs to happen for old images in the Brightwood dataset. When I moved to the new download format, I renamed all of the existing data to use the prefix 'Archive' rather than 'AlertWF'.

## Now, updating the database

In [151]:
# %%sql
# ALTER TABLE stations DROP COLUMN elevation;
# ALTER TABLE stations ADD elevation_km FLOAT;
# ALTER TABLE stations CHANGE name id VARCHAR(255);
# ALTER TABLE stations ADD `name` VARCHAR(255);
# ALTER TABLE stations ADD `state` VARCHAR(2);

 * mysql+pymysql://jmp:***@172.17.0.3:3306/AlertWildfire
0 rows affected.
0 rows affected.
0 rows affected.
0 rows affected.


[]

In [152]:
#%sql SELECT * FROM stations LIMIT 2

 * mysql+pymysql://jmp:***@172.17.0.3:3306/AlertWildfire
2 rows affected.


id,lon,lat,elevation_km,name,state
Axis-Aeneas,-119.622,48.7435,1.569,,
Axis-AlabamaHills1,-118.09,36.5657,1.383,,


In [3]:
# Bind-parameter name -> SQL type for the image INSERT statement below.
params = {
    'stationid': SQL.String,
    'time_stamp': SQL.DateTime,
    'path': SQL.String,
}

# Parameterised INSERT into `images`; every placeholder is bound with the
# explicit type declared in `params`.
insert_image_query = SQL.text('''
        INSERT INTO images (`station`, `time_stamp`, `path`)
        VALUES (:stationid, :time_stamp, :path);
    ''').bindparams(
    *(SQL.bindparam(name, type_=sql_type) for name, sql_type in params.items())
)

import boto3, os, json, re
from datetime import datetime

# The bucket name and key prefix are kept out of the notebook: they are read
# from the "aws" section of ~/Documents/secrets.json.
HOME = os.path.expanduser('~')
secrets_path = os.path.join(HOME, 'Documents', 'secrets.json')
with open(secrets_path, 'r') as secrets_file:
    secrets = json.load(secrets_file)

BUCKET_NAME = secrets['aws']['bucket']
PREFIX = secrets['aws']['prefix']

# Handle on the S3 bucket whose objects are listed further down.
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(BUCKET_NAME)

# Captures (parent folder, file name) from the tail of an S3 key ending in
# ".jpg". The dot is escaped so only a literal ".jpg" extension matches
# (an unescaped "." would also accept e.g. "..._jpg").
regex_path = re.compile(r'([^/]*)/([^/]*\.jpg)$')
# Captures an ISO-8601 timestamp with a negative UTC offset,
# e.g. "2021-06-01T12:00:00-07:00". Positive offsets are not matched.
regex_date = re.compile(r'(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d-\d\d:\d\d)')

def func():
    """Register S3 images that are not yet recorded in the `images` table.

    Lists every object under ``PREFIX`` in ``bucket``, skips keys that
    contain ``'small'`` and keys whose path is already stored in
    ``images.path``, derives the station name and timestamp from each
    remaining key, and inserts one row per image. All inserts run in a
    single transaction that is committed on success and rolled back if
    anything raises. Prints the number of inserted rows (and the number of
    keys skipped for lacking a parsable timestamp, if any).

    Raises:
        ValueError: if a matched timestamp is not a valid ISO-8601 datetime.
    """
    n = 0
    skipped = 0
    # Engine.begin() yields a connection with a transaction already open;
    # it commits when the block exits cleanly and rolls back on an exception.
    with SQL_ENGINE.begin() as conn:
        img_res = conn.execute(SQL.text('SELECT path FROM images'))
        # A set gives O(1) membership checks; the listing below can contain
        # tens of thousands of keys.
        known_paths = {record[0] for record in img_res.fetchall()}

        for obj in bucket.objects.filter(Prefix=PREFIX):
            key = obj.key
            if 'small' in key or key in known_paths:
                continue

            path_match = regex_path.search(key)
            if not path_match:
                continue
            station, fname = path_match.groups()

            date_match = regex_date.search(fname)
            if date_match is None:
                # No timestamp in the file name: skip this key instead of
                # aborting the whole batch with an AttributeError.
                skipped += 1
                continue

            dtstamp = datetime.fromisoformat(date_match.group(1))
            stationid = f'Axis-{station}'

            row = dict(stationid=stationid, time_stamp=dtstamp, path=key)
            conn.execute(insert_image_query, row)
            n += 1

    print('Inserted', n, 'records')
    if skipped:
        print('Skipped', skipped, 'keys without a parsable timestamp')

# Run the S3 -> `images` table sync and report its CPU and wall time.
# Note: this writes rows to the database.
%time func()

Inserted 37835 records
CPU times: user 1min 56s, sys: 1.85 s, total: 1min 57s
Wall time: 4min 1s
