In [1]:
import os
import glob
import time

import json
import pyproj 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_path = "/Users/administrator/Documents/Projects/abq_crime/data/abq_police_incidents_test.json"

In [3]:
# Load the data.
with open(data_path) as json_file:
    raw_data = json.load(json_file)
    json_file.close()

# Determine the number of data points.
num_data = len(raw_data["features"])

A subset of the geometric data is unavailable, so we should list those indices.

In [4]:
# Find the indices of missing geometric data.
missing_geometry_indices = list()

time_start = time.time()
for i in range(num_data):
    if ("geometry" not in list(raw_data["features"][i].keys())):
        missing_geometry_indices.append(i)
time_end = time.time()
print("Entire operation took {} seconds.".format(time_end - time_start))

# See how many indices are missing.
print("There are {} entries without geometric data.".format(len(missing_geometry_indices)))

# Get some example indices.
print("Some example indices: {}".format(missing_geometry_indices[0:5]))

Entire operation took 0.027006149291992188 seconds.
There are 1124 entries without geometric data.
Some example indices: [69, 86, 108, 129, 158]


In [5]:
def to_latlong(point, input_standard="epsg:3857", output_standard="epsg:4326"):
    """ Converts a point from ERSI:102100 to EPSG:4326 (lat/long). """
    transformer = pyproj.Transformer.from_crs(input_standard, output_standard)
    (x_proj, y_proj) = transformer.transform(point[0], point[1])

    return [x_proj, y_proj]

In [6]:
# Extract the data into a list.
extracted_data = list()

time_start = time.time()
for i in range(num_data):
    # Extract the attribute data
    attributes = raw_data["features"][i]["attributes"]
    attribute_data = [attributes[key] for key in attributes.keys()]
    
    # Extract the geometric data. We will need to convert to lat/long before we store it.
    if i in missing_geometry_indices:
        projected_data = [np.nan, np.nan]
    else:
        # Obtain the geometries and then convert it to latlong.
        geometries = raw_data["features"][i]["geometry"]
        geometry_data = [geometries[key] for key in geometries.keys()]
        projected_data = to_latlong(point=(geometry_data[0], geometry_data[1]))
        #projected_data = [geometries[key] for key in geometries.keys()]

    attribute_data += projected_data
    extracted_data.append(attribute_data)
time_end = time.time()
print("Entire operation took {} seconds.".format(time_end - time_start))

Entire operation took 2698.9444539546967 seconds.


In [7]:
# Test cell for collect some data
extracted_data[0:5]

[[45420062,
  'I25 NORTHBOUND SE / COAL AV SE',
  'TRAFFIC STOP',
  1600819200000,
  35.078681961404726,
  -106.63735016938548],
 [45420063,
  'COORS BL NW / SEQUOIA RD NW',
  'TRAFFIC STOP',
  1600819200000,
  35.121312948507,
  -106.7016212161628],
 [45420064,
  'COORS BL NW / EAGLE RANCH RD NW',
  'DIRECT TRAFFIC',
  1600819200000,
  35.174464215283486,
  -106.67368513706435],
 [45420065,
  'BRYN MAWR DR NE / MENAUL BL NE',
  'TRAFFIC STOP',
  1600819200000,
  35.109200978367255,
  -106.60911953450203],
 [45420066,
  'I25 NORTHBOUND SE / COAL AV SE',
  'TRAFFIC STOP',
  1600819200000,
  35.078681961404726,
  -106.63735016938548]]

In [8]:
# Pickle it.
import pickle

with open("./processed_data.pkl", "wb") as f:
    pickle.dump(extracted_data, f)

In [11]:
# Load it
with open("./processed_data.pkl", "rb") as f:
    test_data = pickle.load(f)
test_data[0]

[45420062,
 'I25 NORTHBOUND SE / COAL AV SE',
 'TRAFFIC STOP',
 1600819200000,
 35.078681961404726,
 -106.63735016938548]