In [1]:
import boto3, glob, json, os, random, re, subprocess
import numpy as np
from scipy import stats, signal
from datetime import datetime
from PIL import Image
import pytesseract
import pymysql
import sqlalchemy as SQL
from urllib.parse import quote_plus as QP
import matplotlib.pyplot as plt
import seaborn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot theme to subsequent figures.
seaborn.set_theme()

# Current user's home directory ('~' expanded); used below to locate the
# SQL secrets file.
HOME = os.path.expanduser('~')

In [2]:
def get_sql_url():
    '''
    Build the SQLAlchemy connection URL for the MySQL database.

    Credentials are read from <HOME>/Documents/sql_secrets.json, which
    must be a JSON object with the keys PASSWD, USER, HOST, PORT and DB.

    Returns:
        str: A URL of the form
            mysql+pymysql://<user>:<password>@<host>:<port>/<db>
        with the user and password percent-encoded.

    Raises:
        OSError: If the secrets file cannot be opened.
        json.JSONDecodeError: If the secrets file is not valid JSON.
        KeyError: If one of the expected keys is missing.
    '''
    sql_secrets_path = os.path.join(HOME, 'Documents', 'sql_secrets.json')
    with open(sql_secrets_path, 'r') as secrets_file:
        secrets = json.load(secrets_file)

    # Both credentials are escaped: a raw '@', ':' or '/' in either one
    # would otherwise be parsed as a URL delimiter.
    sql_passwd = QP(secrets['PASSWD'])
    sql_user = QP(secrets['USER'])
    sql_host = secrets['HOST']
    sql_port = secrets['PORT']
    sql_db = secrets['DB']

    return f"mysql+pymysql://{sql_user}:{sql_passwd}@{sql_host}:{sql_port}/{sql_db}"

# Build the connection URL once and share it between the SQLAlchemy engine
# and the SQL cell magic.
SQL_URL = get_sql_url()
SQL_ENGINE = SQL.create_engine(SQL_URL)
# Load the `sql` IPython extension and pass it the same URL.
# NOTE(review): presumably this is ipython-sql, which would open a connection
# here — confirm. SQL_URL embeds the password, so check that this cell's
# output does not echo it before sharing the notebook.
%load_ext sql
%sql $SQL_URL

## Un-archive some data

The next cells were used to migrate imagery that I previously captured using the brute-force Selenium scraping method into the same format as imagery from the more recent method (direct download). Namely, each image is converted to jpg format, a lower-resolution version is stored alongside the full-res one, and the filename timestamp is updated to reflect the time of the image capture, rather than the time of local acquisition.

This only needs to happen for old images in the Brightwood dataset. When I moved to the new download format I renamed all of the existing data so that its prefix is 'Archive' rather than 'AlertWF'.

In [None]:
%time
'''
We need to fetch actual metadata from the mysql database
in order to rename images using their actual timestamp.
We'll do that all at once, and dump the results into a json
file to be loaded again in the following step. This is
mostly just to avoid needing to open the local DB when the
meat of this conversion is processed on an ec2 instance.
'''

selector = SQL.text('SELECT DateTime, Path FROM Metadata;')
with SQL_ENGINE.connect().execution_options(autocommit=True) as conn, \
        open('migrate.json','w') as migrate:
    res = conn.execute(selector)
    keyvals = {key[:-4]: dt.isoformat()+'-08:00' for dt, key in res.fetchall()}
    migrate.write(json.dumps(keyvals))

In [None]:
#!/usr/bin/env python3

'''
For migrating images that were scraped using the
Selenium brute-force method (screenshotting the actual
webpage) into the new format.

For every object under PREFIX whose base name appears in
migrate.json, the script downloads it, converts it to a full-size
jpg and a reduced-size jpg with ImageMagick's `convert`, uploads
both under AlertWF/Brightwood/ using the timestamp from
migrate.json, and then deletes the original object.

The original is deleted only after both conversions and both
uploads have succeeded. If a conversion fails the object is
reported and skipped, leaving the original in place.

Note: Run this on an ec2 instance to save on s3 tx fees.

scp -i ~/.ssh/piwik_key migrate.json ec2-user@ec2-34-216-82-235.us-west-2.compute.amazonaws.com:
'''

import boto3, json, subprocess
from datetime import datetime

BUCKET_NAME = 'storage-9iudgkuqwurq6'
PREFIX = 'Archive/Brightwood'

# Scratch files, reused (overwritten) for every object.
TMP_ORIG = '/tmp/original.png'
TMP_CONV = '/tmp/converted.jpg'
TMP_COMP = '/tmp/compressed.jpg'

s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3_resource.Bucket(BUCKET_NAME)

# Mapping of object base name (no directory, last 4 chars stripped)
# to the ISO timestamp string to use in the new key.
with open('migrate.json','r') as mfile:
    migrate = json.load(mfile)

for obj in bucket.objects.filter(Prefix=PREFIX):
    obj_key = obj.key
    # Base name: last path component with its final four characters
    # removed, mirroring how the keys of migrate.json were built.
    obj_base = (obj_key.split('/')[-1])[:-4]
    if obj_base not in migrate:
        continue

    # Fetch the object to a local temporary file
    s3_client.download_file(BUCKET_NAME, obj_key, TMP_ORIG)

    # Convert the object to (true) jpg format and compress.
    # check=True makes a non-zero exit from `convert` raise instead of
    # being ignored; otherwise a failed conversion would upload stale
    # temp files from the previous iteration and delete the original.
    try:
        subprocess.run(['convert', '-identify', TMP_ORIG, TMP_CONV], check=True)
        # '@250000' is ImageMagick's area geometry: resize so the image
        # has at most ~250000 pixels in total.
        subprocess.run(['convert', '-identify', TMP_ORIG, '-resize', '@250000', TMP_COMP],
                       check=True)
    except subprocess.CalledProcessError as err:
        print(f'Skipping {obj_key}: conversion failed ({err})')
        continue

    # Create new object key
    tstamp = migrate[obj_base]
    conv_key = f'AlertWF/Brightwood/Brightwood_{tstamp}.jpg'
    comp_key = f'AlertWF/Brightwood/Brightwood_{tstamp}-small.jpg'

    # Upload converted objects
    s3_client.upload_file(TMP_CONV, BUCKET_NAME, conv_key)
    s3_client.upload_file(TMP_COMP, BUCKET_NAME, comp_key)

    # Delete original object (reached only if everything above succeeded)
    s3_resource.Object(BUCKET_NAME, obj_key).delete()