In [None]:
import json
import os
import re

import pandas as pd

In [None]:
PICKLE_FILE = './policedata_20240825_df.pd'

# Load the raw data into `df`. Parsing the CSV export is only done when no
# cached pickle exists yet; the parsed frame is then pickled so later runs
# can take the fast path.
if not os.path.isfile(PICKLE_FILE):
    # The export is tab-separated and decoded as UTF-16 little-endian.
    # NOTE(review): the 'utf-16-le' codec does not consume a leading BOM; if the
    # export starts with one it will end up in the first column name - verify.
    with open('/hostshare/police_victimizations_20240807.csv', 'r', encoding='utf-16-le') as raw_export:
        df = pd.read_csv(raw_export, delimiter='\t')
    df.to_pickle(PICKLE_FILE)
else:
    df = pd.read_pickle(PICKLE_FILE)

In [None]:
# Preview the first rows of `df` as the cell output.
df.head()

In [None]:
# Keep only the columns used below. `.copy()` makes `subset` an independent
# frame, so the column assignments that follow modify it directly rather than
# a slice of `df` (which raises SettingWithCopyWarning).
subset = df[["ANZSOC Subdivision", "ANZSOC Division", "Year Month", "Area Unit", "Territorial Authority"]].copy()

# Various data cleaning
# Remove leading/trailing '.' characters from the two place-name columns.
subset["Area Unit"] = subset["Area Unit"].str.strip('.')
subset["Territorial Authority"] = subset["Territorial Authority"].str.strip('.')

# Split "Year Month" on the space: the first piece is stored as "Month"
# (a month name, see `months` below) and the second as a numeric "Year".
subset[["Month", "Year"]] = subset["Year Month"].str.split(' ', expand=True)
del subset["Year Month"]
subset['Year'] = pd.to_numeric(subset['Year'])

months = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    "December": 12
}

# Replace month names with their number. A dict lookup via `.map` yields NaN
# for names missing from `months`, so check explicitly and fail loudly with a
# KeyError, as a plain `months[name]` lookup would.
month_numbers = subset['Month'].map(months)
unknown_months = subset.loc[month_numbers.isna(), 'Month'].unique()
if len(unknown_months):
    raise KeyError('Unrecognised month name(s): {}'.format(list(unknown_months)))
subset['Month'] = month_numbers.astype('int64')

number_regex=re.compile(r'-?\d+(\.\d*)?')

# Where "Area Unit" starts with a number (`str.match` anchors at the start of
# the string only), use the row's "Territorial Authority" instead.
# - The assignment goes through a single `.loc[mask, column]` call; chained
#   indexing (`subset[mask]['Area Unit'] = ...`) writes to a temporary copy and
#   leaves `subset` unchanged.
# - `na=False` treats missing area units as non-matching so the mask is purely
#   boolean.
# - The pattern is passed as a string, which is the documented `pat` type.
is_numeric_area = subset['Area Unit'].str.match(number_regex.pattern, na=False)
subset.loc[is_numeric_area, 'Area Unit'] = subset.loc[is_numeric_area, 'Territorial Authority']

In [None]:
# Write `subset` to a pickle file in the current working directory.
subset.to_pickle('./policedata_20240825_cleaned.pd')

In [None]:
# Preview the first rows of `subset` as the cell output.
subset.head()

In [None]:
# Number of 2024 rows for every (ANZSOC Division, Area Unit) pair, ordered from
# the largest count to the smallest. The result is a Series indexed by that pair.
year_2024_crime_leaderboard = (
    subset[subset['Year'] == 2024]
    .groupby(["ANZSOC Division", "Area Unit"])
    .size()
    .sort_values(ascending=False)
)


In [None]:
# Regroup the leaderboard by division: division name -> list of
# (area unit, count) pairs, in the order the leaderboard yields them.
crimes = {}

for (crime, area), crime_count in year_2024_crime_leaderboard.items():
    crimes.setdefault(crime, []).append((area, crime_count))

# Write one JSON file per division. Create the output directory first so the
# export also works when it does not exist yet; otherwise `open(..., 'w')`
# raises FileNotFoundError.
LEADERBOARD_DIR = '../data/leaderboard_2024_divisions'
os.makedirs(LEADERBOARD_DIR, exist_ok=True)

for crime, crime_list in crimes.items():
    # Build the file name: keep only ASCII letters and spaces, then join the
    # words with underscores and lower-case the result.
    crime = re.sub('[^A-Za-z ]', '', crime)
    crime_snake = '_'.join(crime.split()).lower()

    with open('{}/{}.json'.format(LEADERBOARD_DIR, crime_snake), 'w') as f:
        # Each (area, count) tuple is serialised as a two-element JSON array.
        json.dump(crime_list, f)