# Data extraction for Assignment 2A
First, we import the modules that we need for the data extraction. After importing the modules, we will read the dataset into memory.

In [None]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict

# Categories and years of interest; later cells use them to filter the data.
categories = ['PROSTITUTION', 'VEHICLE THEFT']
years = [2003, 2015]

print("Reading in data...")
# Only the three columns used by the counting cells are loaded.
df = pd.read_csv('sfpd_dataset.csv', usecols=['Date', 'Category', 'PdDistrict'])
# `infer_datetime_format` is deprecated since pandas 2.0 (strict format
# inference is the default there), so it is no longer passed.
df['Date'] = pd.to_datetime(df['Date'])
print("Done!")

# `df` has not been filtered yet, so report the overall size and, separately,
# how many of those incidents fall in the categories of interest.
print("We have", len(df), "incidents in total, of which",
      int(df['Category'].isin(categories).sum()),
      "are in " + " and ".join(categories))

We initialize counters for counting the number of crimes in the categories
for each district and for counting the total number of crime incidents in the districts.
We will later iterate through the dataset and count the number
of crimes; first, we split our loaded dataframe into one dataframe for each year.

In [None]:
# Nested counters: missing keys start at 0, so they can be incremented
# without initialising each year/district/category first.
# year -> district -> number of incidents (all categories)
total_crime_counter = defaultdict(lambda: defaultdict(int))
# year -> district -> category -> number of incidents
counters = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

print("Preparing dataframes: group by year")
# Extract the year once and build one dataframe per entry in `years`
# (instead of hard-coding the first two entries).
incident_year = df['Date'].dt.year
dfs = {year: df[incident_year == year] for year in years}
print("Done!")

Now we are ready for counting the total crime in the districts for each year in all categories.

In [None]:
print("Counting total crime for each year...")
# `year_df` (not `df`) is used as the loop name so the notebook-level `df`
# holding the full dataset is not overwritten by the last year's frame.
for year, year_df in dfs.items():
    # Only the district column is needed; iterating it directly avoids the
    # per-row Series construction done by `iterrows()`.
    for district in year_df['PdDistrict']:
        total_crime_counter[year][district] += 1
print("Done!")

We will continue on and prepare new dataframes by excluding every category that is not in our list of `categories`.
After the exclusion, we will do another count.

In [None]:
print("Preparing dataframes: exclude categories")
# Keep only the rows whose category is in `categories`, for every yearly frame.
# Iterating over a snapshot of the keys makes the in-loop reassignment safe.
for year in list(dfs):
    year_df = dfs[year]
    dfs[year] = year_df[year_df['Category'].isin(categories)]
print("Done!")

print("Counting categorical crime for each year on a per district basis...")
# `year_df` (not `df`) is used as the loop name so the notebook-level `df`
# is not overwritten. The dict key is used as the year directly: each frame
# was selected by that year, so re-reading it from every row is redundant.
for year, year_df in dfs.items():
    for district, category in zip(year_df['PdDistrict'], year_df['Category']):
        counters[year][district][category] += 1
print("Done!")

We no longer need the counting functionality, so we prepare to save our dataset
by converting it to dictionaries, which can easily be converted to the JSON
format and saved to a file.

In [None]:
print("Converting the counters to JSON objects...")
for year, district_counters in counters.items():
    # One flat record per district: its name, its total number of incidents
    # for the year, and one key per counted category.
    districts = [
        {
            'district': district,
            'total_crime': total_crime_counter[year][district],
            **crime_counts,  # merge the per-category counts, python3.5+ only
        }
        for district, crime_counts in district_counters.items()
    ]

    # Save the year's records as a JSON array in "<year>.json".
    filename = str(year) + '.json'
    with open(filename, 'w') as f:
        json.dump(districts, f)
    # Report success only after the file has been closed, and build the
    # message by concatenation so the file name is printed without the
    # spaces that multi-argument print() would insert.
    print("Converted and saved as " + filename + "!")