In [23]:
import sys
import numpy as np
import pandas as pd
import os
import json
from functools import reduce
from datetime import datetime
from pandas.io.json import json_normalize
from itertools import chain, starmap

## Extract and Clean Crime Data

In [2]:
# Define directories
CRIME_RECORDS_DIR = '../crime_data'
CRIME_DATA_DF_LIST = []
WEATHER_DATA_DIR = '../weather_data'

In [3]:
# Open each file, add the crime records to a list: CRIME_DATA_DF_LIST
def preprocess_file(file):
    with open("{}/{}".format(CRIME_RECORDS_DIR,file), 'r') as f:
        data = f.read()
    json_data = json.loads(json.loads(data))['dataEvents'] #returns list of crime records
    if(len(json_data) > 0):  
        tmp_df = pd.DataFrame(json_data)
        CRIME_DATA_DF_LIST.append(tmp_df)

In [4]:
# Iterate through all files in crime directory
for file in os.listdir(CRIME_RECORDS_DIR):
    if(file.endswith(".json")):
        # Preprocess file
        preprocess_file(file)

In [5]:
crime_df = pd.concat(CRIME_DATA_DF_LIST)

In [6]:
column_mapping = {"view34":"Incident_ID", "view35":"Crime","view36":"Location","view62":"Date_Time","view84":"Agency","view81":"Longitude","view82":"Latitude","view85":"Accuracy","view96":"Address"}
crime_df.rename(columns=column_mapping,inplace=True)
crime_df.drop(columns=['view86','view174','view177','view182','view175'],inplace=True)

In [7]:
# # UNIT TESTING: DateTime to Unix Timestamp
# # Source: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
# s = "Mar 28, 2013 3:33 PM"
# d = datetime.strptime(s, '%b %d, %Y %I:%M %p')
# print(int(d.timestamp()))

In [9]:
# Add extra date column (where date is a Unix Timestamp)
crime_df['Date_Time_Unix'] = crime_df['Date_Time']
crime_df['Date_Time_Unix'].apply(lambda x: int(datetime.strptime(x, '%b %d, %Y %I:%M %p').timestamp()))

0     1364499180
1     1364410800
2     1364400600
3     1364326200
4     1364305800
         ...    
40    1531209600
41    1530756960
42    1530726300
43    1530724980
44    1530724920
Name: Date_Time_Unix, Length: 3810, dtype: int64

## Add Weather Data

In [38]:
# Source https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
def flatten_json(y):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(y)
    return out

In [51]:
def preprocess_file(file):
    records_dict_list = []
    with open("{}/{}".format(WEATHER_DATA_DIR,file), 'r') as f:
        data = f.read()
    json_data = json.loads(data)
    for record in json_data:
        tmp_df = flatten_json(record)
        records_dict_list.append(tmp_df)
    # Create dataframe of all dict records
    
    print(len(records_dict_list))

In [52]:
preprocess_file('weather_troy.json')

52583
