
feat: improve ocr recognitions (#3) #12

Merged: 3 commits, Jan 4, 2022
17 changes: 10 additions & 7 deletions README.md
@@ -52,10 +52,13 @@ Everyday (with `cron`!), the script is run (`cd scrape && python3 main.py`) to s

After we download the file, we prepare it for image cropping and OCR. To do this, we

1. Convert all the PDF file's pages into images ([Example Page](docs/2021-04-08_page_1.png))
2. Vertically concatenate all the page images into one long image, cropping out the top and the bottom so that only arrest records remain ([Example Image](docs/concat.png))
3. Crop out each individual arrest record using the location of pixels ([Example Image](docs/record_10.png))
4. Crop each portion of the arrest record by the categories we want to parse ([Example Image - Race, Age, and Sex](docs/record_10_race_age_and_sex.png))
5. Use OCR ([PyTesseract](https://pypi.org/project/pytesseract/)) to parse the text

We then upload the data to AWS DynamoDB. Using Flask and DynamoDB's boto3 module, data is served to the [HPDStats website](https://hpdstats.com/).
1. Split the PDF into individual pages ([Example Page PDF](docs/example_page.pdf))
2. Convert all the PDF file's pages into images ([Example Page Image](docs/example_page.png))
3. Vertically concatenate all the page images into one long image, cropping out the top and the bottom so that only arrest records remain ([Example Vertically Concatenated Image](docs/example_vconcat.png))
4. Crop out each individual arrest record using the location of pixels ([Example Record Image](docs/example_record.png))
5. Crop each portion of the arrest record by the categories we want to parse:
   - [Example Record - Age Category](docs/example_record_age.png)
   - [Example Record - Ethnicities Category](docs/example_record_ethnicities.png)
6. Use OCR ([PyTesseract](https://pypi.org/project/pytesseract/)) to parse the text

We then upload the data to AWS DynamoDB. Using Flask and DynamoDB's boto3 module, data is served to the [HPDStats website](https://hpdstats.com/). An example of the artifacts generated by the script can be viewed here: [Example Artifacts](https://honolulupd-artifacts.s3.us-west-1.amazonaws.com/2022-01-01.zip)
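The record-cropping step described above ("crop each individual arrest record using the location of pixels") reduces to simple box geometry once the starting row of each record is known. A minimal sketch; the function name and the 5-pixel margin are illustrative, mirroring the offsets that appear in `scrape/main.py`:

```python
def record_crop_boxes(starting_points, width, height, margin=5):
    """Compute (left, top, right, bottom) crop boxes for each arrest
    record, given the y-coordinate where each record starts in the
    long concatenated image."""
    boxes = []
    for i, start in enumerate(starting_points):
        top = start - margin
        # Each record ends where the next one begins; the last record
        # runs to the bottom of the image.
        if i + 1 < len(starting_points):
            bottom = starting_points[i + 1] - margin
        else:
            bottom = height
        boxes.append((0, top, width, bottom))
    return boxes
```

Each box can then be passed directly to an image-cropping call such as `PIL.Image.crop`, which takes the same (left, top, right, bottom) tuple.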
4 changes: 2 additions & 2 deletions app/static/index.js
@@ -387,7 +387,7 @@ function fillOfficerData(records) {
const officers = {};

for (const record of records) {
const officer = record.officers[0];
const officer = record.arrest_officer[0];
if (officers[officer]) {
officers[officer].push(record);
} else {
@@ -402,7 +402,7 @@ function fillOfficerData(records) {
const matchingRecords = [];
const arrestedEthnicities = {};
for (const record of records) {
if (record.officers.includes(officer)) {
if (record.arrest_officer.includes(officer)) {
matchingRecords.push(record);

for (const ethnicity of record.ethnicities) {
3 changes: 2 additions & 1 deletion app/static/map/map.js
@@ -58,13 +58,14 @@ fetch('/api/records')
radius: 200,
color: getColor(differenceInDays(new Date(record.date), new Date()))
});

circle.addTo(map);
circle.bindPopup(`
<div class="ui bulleted list">
<div class="item"><b>Age:</b> ${record.age}</div>
<div class="item"><b>Ethnicities:</b> ${record.ethnicities.join(', ')}</div>
<div class="item"><b>Location:</b> ${record.locations[0].address}</div>
<div class="item"><b>Officers:</b> ${record.officers.join(', ')}</div>
<div class="item"><b>Officers:</b> ${record.arrest_officer.join(', ')}</div>
<div class="item"><a href="https://honolulupd-records.s3-us-west-1.amazonaws.com/${record.imageId}"><b>View Record</b></a></div>
</div>
`);
2 changes: 1 addition & 1 deletion app/static/table/table.js
@@ -12,7 +12,7 @@ $(document).ready(() => {
{ "data": "age" },
{ "data": "sex" },
{ "data": "ethnicities" },
{ "data": "officers" },
{ "data": "arrest_officer" },
{
"data": (row, type, val, meta) => {
return row.locations.length ? row.locations[0].address : "";
Binary file removed docs/2021-04-08_page_1.png
Binary file added docs/example_page.png
Binary file added docs/example_record.png
Binary file added docs/example_record_age.png
Binary file added docs/example_record_ethnicities.png
Binary file added docs/example_vconcat.png
Binary file removed docs/record_10.png
Binary file removed docs/record_10_race_age_and_sex.png
13 changes: 13 additions & 0 deletions scrape/constants.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""Constants for the project."""

ARREST_LOG_URL = 'https://www.honolulupd.org/information/arrest-logs/'

PDF_ROOT_DIRECTORY = 'pdfs'
PDF_TMP_DIRECTORY = PDF_ROOT_DIRECTORY + '/tmp'

LOGS_BUCKET_NAME = 'honolulupd-arrest-logs'
RECORDS_BUCKET_NAME = 'honolulupd-records'
ARTIFACTS_BUCKET_NAME = 'honolulupd-artifacts'

RECORDS_TABLE_NAME = 'honolulupd.org-records'
4 changes: 0 additions & 4 deletions scrape/imgs/.gitignore

This file was deleted.

186 changes: 147 additions & 39 deletions scrape/main.py
@@ -1,90 +1,199 @@
# -*- coding: utf-8 -*-
"""The scrape module, containing the HPD arrest log scraper/parser."""

import os
import uuid

import cv2

import constants

from utils.ddb import insert_item
from utils.imgs import (
concat_images,
crop_image,
convert_pdf_to_png,
retrieve_image_dimensions,
retrieve_record_starting_points
retrieve_record_starting_points,
retrieve_offense_starting_points
)
from utils.ocr import read_text
from utils.pdfs import split_into_pages
from utils.ocr import clean_text, read_text, correct_ethnicities
from utils.pdfs import split_into_pages, initialize_pdf_directory
from utils.parse import (
get_record_categories,
get_dimensions_from_category,
handle_text_assignment
get_static_record_categories,
get_dynamic_record_categories,
get_dimensions_from_static_category,
get_dimensions_from_dynamic_category
)
from utils.s3 import upload_file
from utils.scrape import check_for_update
from utils.location import geolocate_location
from utils.files import copy_file, zip_directory

IMAGE_DIRECTORY = 'imgs'
RECORDS_BUCKET_NAME = 'honolulupd-records'
RECORDS_TABLE_NAME = 'honolulupd.org-records'

def main(pdf_files):
"""The entry-point main function."""
for pdf_file in pdf_files:
pdf_file_pages = split_into_pages(pdf_file)
img_files = []
pdf_directory_registry = initialize_pdf_directory(
os.path.splitext(pdf_file[5:])[0])
copy_file_path = f"{pdf_directory_registry['root']}/index.pdf"
copy_file(pdf_file, copy_file_path)
manifest_file_path = f"{pdf_directory_registry['root']}/manifest.txt"
with open(manifest_file_path, 'w', encoding='utf-8') as manifest_file:
manifest_file.write('ARREST LOG MANIFEST\n')

pdf_file_pages = split_into_pages(
pdf_directory_registry['pages'], pdf_file)

image_files = []
for page in pdf_file_pages:
img_files.append(convert_pdf_to_png(page))
image_files.append(
convert_pdf_to_png(
pdf_directory_registry['images'],
page
)
)

for i, img_file in enumerate(img_files):
for i, img_file in enumerate(image_files):
# Crop the headers and footers out of each page
width, height = retrieve_image_dimensions(img_file)
if i == 0:
left, top, right, bottom = 0, 270, width, height - 100
else:
left, top, right, bottom = 0, 160, width, height - 90
crop_image(img_file, img_file, (left, top, right, bottom))
img_files[i] = cv2.imread(img_file)
concat_image = concat_images(img_files)

image_files[i] = cv2.imread(img_file)
concat_image = concat_images(
pdf_directory_registry['images'],
image_files
)
record_starting_points = retrieve_record_starting_points(concat_image)

print(f'# of records detected: {len(record_starting_points)}')
with open(manifest_file_path, 'a', encoding='utf-8') as manifest_file:
manifest_file.write(f'# of records: {len(record_starting_points)}\n')

width, height = retrieve_image_dimensions(concat_image)
records = []
for i, starting_point in enumerate(record_starting_points):
left, top, right, bottom = 0, starting_point - 5, width, height
if i + 1 != len(record_starting_points):
bottom = record_starting_points[i + 1] - 5
cropped_record_filename = f'{IMAGE_DIRECTORY}/record_{str(i)}.png'
crop_image(concat_image, cropped_record_filename,
(left, top, right, bottom))

categories = get_record_categories()
record = {}

width, height = retrieve_image_dimensions(cropped_record_filename)
for category in categories:
top, bottom = 0, height
left, right = get_dimensions_from_category(category, width)
cropped_category_filename = f'{IMAGE_DIRECTORY}/record_{str(i)}_{category}.png'
records_subdir = pdf_directory_registry['records']
cropped_record_filename = f'{records_subdir}/{str(i + 1)}.png'
records.append(crop_image(
concat_image,
cropped_record_filename,
(left, top, right, bottom)
))

total_offenses_count = 0
for i, record_image_path in enumerate(records):
print(record_image_path)
record_data = {}

# START - ITERATE OVER STATIC CATEGORIES - START
static_categories = get_static_record_categories()

for category in static_categories:
top, bottom, left, right = get_dimensions_from_static_category(
category)
cropped_category_filename = \
f'{constants.PDF_TMP_DIRECTORY}/record_{str(i + 1)}_{category}.png'
crop_image(
cropped_record_filename,
record_image_path,
cropped_category_filename,
(left,
top,
right,
bottom))
category_text = read_text(cropped_category_filename)
record = handle_text_assignment(
category, category_text, record)
record_data[category] = clean_text(category_text)

# END - ITERATE OVER STATIC CATEGORIES - END

# START - ITERATE OVER DYNAMIC CATEGORIES - START

offenses_starting_points = retrieve_offense_starting_points(
record_image_path)
total_offenses_count += len(offenses_starting_points)
print(f'# of offenses detected: {len(offenses_starting_points)}')

offenses = []
for j, offense_starting_point in enumerate(
offenses_starting_points):
left, top, right, bottom = 0, offense_starting_point - 5, width, height
if j + 1 != len(offenses_starting_points):
bottom = offenses_starting_points[j + 1] - 5
cropped_offense_filename = \
f'{constants.PDF_TMP_DIRECTORY}/record_{str(i + 1)}_offense_{str(j + 1)}.png'
offenses.append(crop_image(
record_image_path,
cropped_offense_filename,
(left, top, right, bottom)
))

dynamic_categories = get_dynamic_record_categories()
for category in dynamic_categories:
record_data[category] = []

for offense_file_path in offenses:
for category in dynamic_categories:
top, bottom, left, right = get_dimensions_from_dynamic_category(
category)
cropped_category_filename = \
f'{constants.PDF_TMP_DIRECTORY}/record_{str(i + 1)}_{category}.png'
crop_image(
offense_file_path,
cropped_category_filename,
(left,
top,
right,
bottom))
category_text = read_text(cropped_category_filename)
record_data[category].append(clean_text(category_text))

# END - ITERATE OVER DYNAMIC CATEGORIES - END

# Convert "ethnicities" string to an array of strings
# "White, Tongan" -> ["White", "Tongan"]
record_data['ethnicities'] = record_data['ethnicities'].split(',')
record_data['ethnicities'] = correct_ethnicities(
record_data['ethnicities'])

# Geolocate the locations
locations = []
for location in record_data['location_of_arrest']:
geolocated_location = geolocate_location(location)
locations.append(geolocated_location)
record_data['locations'] = locations

img_file_id = str(uuid.uuid4())
img_filename = f'{img_file_id}.png'
with open(cropped_record_filename, 'rb') as file:
upload_file(RECORDS_BUCKET_NAME, img_filename, file, 'image/png')
record['id'] = str(uuid.uuid4())
record['imageId'] = img_filename
print(record)
insert_item(RECORDS_TABLE_NAME, {
with open(record_image_path, 'rb') as file:
upload_file(
constants.RECORDS_BUCKET_NAME,
img_filename,
file,
'image/png')
record_data['id'] = str(uuid.uuid4())
record_data['imageId'] = img_filename
print(record_data)
insert_item(constants.RECORDS_TABLE_NAME, {
'date': pdf_file.split('/')[1][0:-4],
**record
**record_data
})

with open(manifest_file_path, 'a', encoding='utf-8') as manifest_file:
manifest_file.write(f'# of offenses: {total_offenses_count}\n')

output_zip_file = f"{constants.PDF_TMP_DIRECTORY}/{pdf_file.split('/')[1][0:-4]}"
zip_directory(output_zip_file, pdf_directory_registry['root'])
with open(output_zip_file + '.zip', 'rb') as file:
upload_file(constants.ARTIFACTS_BUCKET_NAME, pdf_file.split(
'/')[1][0:-4] + '.zip', file, 'application/zip')


def retrieve_files():
"""Wrapper around the check for new PDF files."""
pdf_files = check_for_update()
@@ -95,4 +204,3 @@ def retrieve_files():

if __name__ == '__main__':
main(retrieve_files())
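`main.py` relies on `retrieve_record_starting_points` and `retrieve_offense_starting_points` from `utils.imgs`, whose implementations are not shown in this diff. One plausible pixel-based detector (an assumption for illustration, not the project's actual code) scans a grayscale image for rows that are almost entirely dark, i.e. the horizontal rules that open each record:

```python
import numpy as np

def find_starting_points(gray, darkness=60, coverage=0.9):
    """Return the first row index of each horizontal dark rule in a
    grayscale image (2-D uint8 array): rows where at least `coverage`
    of the pixels are darker than `darkness`."""
    dark = (gray < darkness).mean(axis=1) >= coverage
    # Collapse consecutive dark rows into a single starting point.
    return [i for i, d in enumerate(dark) if d and (i == 0 or not dark[i - 1])]
```

The `darkness` and `coverage` thresholds are illustrative; in practice they would be tuned against the scanned arrest-log pages.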

1 change: 1 addition & 0 deletions scrape/pdfs/.gitignore
@@ -1,3 +1,4 @@
*
*/
!.gitignore
!tmp
3 changes: 3 additions & 0 deletions scrape/pdfs/tmp/.gitignore
@@ -0,0 +1,3 @@
# *
# */
# !.gitignore
1 change: 1 addition & 0 deletions scrape/utils/__init__.py
@@ -7,3 +7,4 @@
from .pdfs import *
from .s3 import *
from .scrape import *
from .files import *
5 changes: 5 additions & 0 deletions scrape/utils/ddb.py
@@ -4,11 +4,15 @@
import boto3
import botocore


def check_if_item_exists(table_name, key):
"""Checks if the given key exists in a table."""
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(table_name)

if not key:
return False

try:
response = table.get_item(Key=key)
except botocore.exceptions.ClientError as error:
@@ -19,6 +23,7 @@ def check_if_item_exists(table_name, key):
return False
return response['Item']


def insert_item(table_name, item):
"""Inserts the given item into in a table."""
dynamodb = boto3.resource('dynamodb')
14 changes: 14 additions & 0 deletions scrape/utils/files.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""The files utility module."""

import shutil


def copy_file(src, dst):
"""Copies a file from src to dst."""
shutil.copyfile(src, dst)


def zip_directory(zip_name, dir_name):
"""Zips up an archive given a directory."""
shutil.make_archive(zip_name, 'zip', dir_name)
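The `zip_directory` helper hides one subtlety of the standard library: `shutil.make_archive` appends the format extension itself, which is why `main.py` opens `output_zip_file + '.zip'` after calling it. Since `make_archive` also returns the full path of the archive it wrote, a variant of the helper could return that path directly (a sketch, not the committed code):

```python
import shutil

def zip_directory(zip_name, dir_name):
    """Zip up dir_name and return the path of the archive written.

    Note: make_archive appends '.zip' to zip_name itself, so the file
    on disk is zip_name + '.zip', not zip_name.
    """
    return shutil.make_archive(zip_name, 'zip', dir_name)
```

Returning the path avoids string-concatenating `'.zip'` at every call site.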