## Imports

In [None]:
import io
from collections import namedtuple
from pathlib import Path
import zipfile

import datamate
import pandas as pd
from tqdm.notebook import tqdm

## Name For This Data

In [None]:
# Dataset names used to look up configuration keys further down.
name_raw = "police-data/raw"  # input: the zip archives that are read
name = "police-data/derived"  # output: the cleaned parquet dataset

## Load Necessary Configuration

In [None]:
# Resolve the configuration key for the derived dataset name.
# NOTE(review): the `datamate` helpers are not visible in this notebook;
# presumably they read project-level configuration — confirm.
key = datamate.config.get_key_for_name(name=name)

# Output location and logger are both looked up from the derived key.
path_data = datamate.config.get_path_data_for_key(key=key)
logger = datamate.logging.get_logger(key=key)

logger.info(f"{key=} loaded for {name=}")
logger.info(f"{path_data=}")

# Same lookup for the raw dataset; only its data path is needed (as input).
key_raw = datamate.config.get_key_for_name(name=name_raw)
path_data_raw = datamate.config.get_path_data_for_key(key=key_raw)

logger.info(f"Using {path_data_raw=} for {key_raw=}")

## Data Information

- Main link is at https://data.police.uk/about/
- Each zip file contains three years' worth of data, so there is a lot of overlap between files
- There are three file types:
    1. `street` - street level crime data, the one we are most interested in
    1. `outcomes` - outcomes from the crimes
    1. `stop-and-search` - records about stop and search carried out by the police
- For `street` crime, we have the following columns:
  | Field | Meaning |
  | -- | -- |
  | Reported by | The force that provided the data about the crime. |
  | Falls within | At present, also the force that provided the data about the crime. This is currently being looked into and is likely to change in the near future. |
  | Longitude and Latitude | The anonymised coordinates of the crime. See [Location Anonymisation](https://data.police.uk/about/#location-anonymisation) for more information. |
  | LSOA code and LSOA name | References to the [Lower Layer Super Output Area](http://data.gov.uk/dataset/lower_layer_super_output_area_lsoa_boundaries) that the anonymised point falls into, according to the LSOA boundaries provided by the Office for National Statistics. |
  | Crime type | One of the crime types listed in the [Police.UK FAQ](https://www.police.uk/pu/about-police.uk-crime-data/). |
  | Last outcome category | A reference to whichever of the outcomes associated with the crime occurred most recently. For example, this crime's 'Last outcome category' would be 'Formal action is not in the public interest'. |
  | Context | A field provided for forces to provide additional human-readable data about individual crimes. Currently, for newly added CSVs, this is always empty. |
- Crime categories (mappings can be found at https://www.police.uk/SysSiteAssets/police-uk/media/downloads/crime-categories/police-uk-category-mappings.csv):
  - **All crime**: Total for all categories.
  - **Anti-social behaviour**: Includes personal, environmental and nuisance anti-social behaviour.
  - **Bicycle theft**: Includes the taking without consent or theft of a pedal cycle.
  - **Burglary**: Includes offences where a person enters a house or other building with the intention of stealing.
  - **Criminal damage and arson**: Includes damage to buildings and vehicles and deliberate damage by fire.
  - **Drugs**: Includes offences related to possession, supply and production.
  - **Other crime**: Includes forgery, perjury and other miscellaneous crime.
  - **Other theft**: Includes theft by an employee, blackmail and making off without payment.
  - **Possession of weapons**: Includes possession of a weapon, such as a firearm or knife.
  - **Public order**: Includes offences which cause fear, alarm or distress.
  - **Robbery**: Includes offences where a person uses force or threat of force to steal.
  - **Shoplifting**: Includes theft from shops or stalls.
  - **Theft from the person**: Includes crimes that involve theft directly from the victim (including handbag, wallet, cash, mobile phones) but without the use or threat of physical force.
  - **Vehicle crime**: Includes theft from or of a vehicle or interference with a vehicle.
  - **Violence and sexual offences**: Includes offences against the person such as common assaults, Grievous Bodily Harm and sexual offences.

In [None]:
# Columns the parquet dataset is partitioned by when it is written.
partition_cols = ["year", "month", "policeForce"]

# Column order of the cleaned frame: per-crime fields first, then the
# partition columns.
all_columns = [
    # identity / time / category
    "idCrime",
    "date",
    "crimeType",
    # where it happened
    "longitude",
    "latitude",
    "location",
    "lsoaCode",
    "lsoaName",
    # outcome and forces involved
    "lastOutcomeCategory",
    "reportedByPoliceForce",
    "fallsWithinPoliceForce",
    *partition_cols,
]

# Raw CSV header -> camelCase column name.
map_headers = {
    "Crime ID": "idCrime",
    # Raw "Month" lands in `month`; clean_data parses it into `date` and then
    # replaces `month` with the month number.
    "Month": "month",
    "Reported by": "reportedByPoliceForce",
    "Falls within": "fallsWithinPoliceForce",
    "Longitude": "longitude",
    "Latitude": "latitude",
    "Location": "location",
    "LSOA code": "lsoaCode",
    "LSOA name": "lsoaName",
    "Crime type": "crimeType",
    "Last outcome category": "lastOutcomeCategory",
    # Renamed here but not part of `all_columns`, so it is dropped later.
    "Context": "context",
}

def clean_data(
    df_raw: pd.DataFrame,
    police_force: str,
    map_headers: dict[str, str],
    all_columns: list[str],
) -> pd.DataFrame:
    """Return a renamed, typed copy of one raw street-crime CSV frame.

    The input frame is not modified; ``rename`` and ``assign`` both return
    new frames.

    Args:
        df_raw: Frame as read from a ``*-street.csv`` file, with the raw
            headers that appear as keys of ``map_headers``.
        police_force: Value stored in the ``policeForce`` column for every row.
        map_headers: Mapping from raw CSV header to output column name.
        all_columns: Columns to keep, in the order they should appear.

    Returns:
        A frame containing exactly ``all_columns``. Text fields use the pandas
        ``string`` dtype, coordinates are numeric, ``date`` is a datetime, and
        ``crimeType``, ``year``, ``month``, ``policeForce`` and the two
        force columns are categorical.

    Raises:
        KeyError: If a column needed by the transformations or listed in
            ``all_columns`` is missing after renaming.
    """
    return (
        df_raw
        .rename(columns=map_headers)
        # `assign` applies keyword arguments in order, so `date` is parsed
        # from the raw `month` string before `month` is overwritten below.
        .assign(
            idCrime=lambda x: x["idCrime"].astype("string"),
            date=lambda x: pd.to_datetime(x["month"]),
            # e.g. "Bicycle theft" -> "bicycle-theft"
            crimeType=lambda x: x["crimeType"].astype("category").cat.rename_categories(
                lambda s: s.lower().replace(" ", "-")
            ),
            longitude=lambda x: pd.to_numeric(x["longitude"]),
            latitude=lambda x: pd.to_numeric(x["latitude"]),
            location=lambda x: x["location"].astype("string"),
            lsoaCode=lambda x: x["lsoaCode"].astype("string"),
            lsoaName=lambda x: x["lsoaName"].astype("string"),
            lastOutcomeCategory=lambda x: x["lastOutcomeCategory"].astype("string"),
            year=lambda x: x["date"].dt.year.astype("category"),
            month=lambda x: x["date"].dt.month.astype("category"),
            policeForce=police_force,
            reportedByPoliceForce=lambda x: x["reportedByPoliceForce"].astype("category"),
            fallsWithinPoliceForce=lambda x: x["fallsWithinPoliceForce"].astype("category"),
        )
        .astype({"policeForce": "category"})
        # A file without a "Context" column must not fail here: the column is
        # discarded anyway by the selection on the next line.
        .drop(columns=["context"], errors="ignore")
        [all_columns]
    )


# # Testing function:
# with zipfile.ZipFile(path_data_raw.joinpath("2022-12.zip")) as zfile:
#     police_force = "metropolitan"
#     df = clean_data(
#         df_raw=pd.read_csv(io.BytesIO(zfile.read(f"2022-12/2022-12-{police_force}-street.csv"))), 
#         police_force=police_force,
#         map_headers=map_headers,
#         all_columns=all_columns,
#     )
# df

In [None]:
# Location of the derived parquet data, inside the derived data path.
path_derived = path_data / "data.parquet"
logger.info(f"Data will be saved to {path_derived=}")

In [None]:
# Identifies one monthly street-level file of one police force.
MonthPoliceForce = namedtuple("MonthPoliceForce", ["month", "police_force"])

# (month, police_force) pairs already written; archives overlap, so the same
# pair can appear in several of them and must only be written once.
files_read = set()
street_csv = "street.csv"

# Only real zip archives are processed; anything else found in the raw folder
# (stray files, directories) is skipped instead of crashing part-way through.
zip_paths = []
for candidate in sorted(path_data_raw.iterdir(), reverse=True):
    if zipfile.is_zipfile(candidate):
        zip_paths.append(candidate)
    else:
        logger.warning(f"Skipping {candidate=}: not a zip archive")

# Archives are visited in reverse-sorted name order, so for a duplicated
# (month, police_force) pair the first archive visited wins.
# NOTE(review): presumably archive names sort chronologically (e.g.
# "2022-12.zip"), making the newest archive the winner — confirm.
#
# NOTE(review): each `to_parquet` call with `partition_cols` adds files under
# `path_derived`; re-running this cell on an existing dataset presumably
# duplicates rows — confirm, and clear `path_derived` first if so.
for zip_folder in tqdm(zip_paths):
    logger.info(f"Working on {zip_folder=}")
    with zipfile.ZipFile(zip_folder) as zfile:
        for file in reversed(zfile.infolist()):
            if not file.filename.endswith(street_csv):
                continue

            # Member names look like "<month>/<month>-<force>-street.csv".
            path_parts = file.filename.split("/")
            month = path_parts[0]
            police_force = (
                path_parts[1]
                .split(f"{month}-", 1)[-1]
                .split(f"-{street_csv}", 1)[0]
            )

            month_police_force = MonthPoliceForce(month=month, police_force=police_force)
            if month_police_force in files_read:
                continue

            logger.debug(f"Reading {file.filename=} with {month_police_force=}")
            df_raw = pd.read_csv(io.BytesIO(zfile.read(file)))

            df_clean = clean_data(
                df_raw=df_raw,
                police_force=police_force,
                map_headers=map_headers,
                all_columns=all_columns,
            )
            logger.debug(f"{df_raw.shape=}, {df_clean.shape=}")

            df_clean.to_parquet(path=path_derived, partition_cols=partition_cols)
            # Marked as done only after the write succeeded.
            files_read.add(month_police_force)

In [None]:
%%time

# Sanity check: read the derived dataset back one year at a time
# (2010 through 2022 inclusive) and log a summary of each year's frame.
for year in range(2010, 2023):
    # Restrict the read to rows of this year and to a subset of columns.
    df = pd.read_parquet(
        path=path_derived, 
        filters=[("year", "=", year)],
        columns=["date", "idCrime", "crimeType", "longitude", "latitude", "policeForce"],
    )
    logger.info(f"{year=}")
    logger.info(f"{df.columns=}")
    logger.info(f"{df.shape=}")
    # Total of the per-column memory usage, printed with thousands separators.
    logger.info(f"{df.memory_usage().sum()=:,}")
    # In the two calls below the line break after `=` is part of the
    # self-documenting f-string, so the summary table starts on its own line.
    logger.info(f"""{df.describe()=
}""")
    logger.info(f"""{df.describe(include='category')=
}""")