In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import os

import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime

from pyproj import CRS, Transformer

# Switch guarding the folium map cells below; set to False to skip them.
MAPS_ENABLED = True

# CSV export endpoint the raw records are downloaded from.
data_url = "https://data.lacity.org/api/views/2nrs-mtv8/rows.csv?accessType=DOWNLOAD"

# Fetch the raw CSV only when there is no local copy yet.
raw_csv_present = os.path.isfile("data.csv")
if not raw_csv_present:
    urllib.request.urlretrieve(data_url, "data.csv")

"""
Use the store_object function to make local caching of the data.
"""
def store_object(filename, instruction):
    """
    Return the object cached in `filename`, building and caching it on a miss.

    Parameters
    ----------
    filename : path of the pickle file used as the cache.
    instruction : zero-argument callable that produces the object when no
        cache file exists yet.

    Returns
    -------
    The unpickled content of `filename` if that file exists, otherwise the
    value returned by `instruction()` (which is also pickled to `filename`).
    """
    # check if file was cached before and read it
    if os.path.isfile(filename):
        # NOTE: unpickling can execute arbitrary code - only point this at
        # cache files that were produced locally by this function.
        with open(filename, "rb") as cache_file:
            return pickle.load(cache_file)

    # run instruction and save it as pickle then return object
    obj = instruction()
    # `with` guarantees the handle is flushed and closed even if dumping fails
    with open(filename, "wb") as cache_file:
        pickle.dump(obj, cache_file)
    return obj
    
# Parse the downloaded CSV on a cache miss; otherwise reuse the pickled frame.
data = store_object(filename="data.pkl", instruction=lambda: pd.read_csv("data.csv"))

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# List the column labels of the loaded frame.
data.columns

### Cleaning data

In [None]:
# Count the missing entries in every column.
missing_values = data.isna().sum()
missing_values

In [None]:
# Negative victim ages are invalid (89 such records were counted with
# `(data["Vict Age"] < 0).sum()`), so only rows with age >= 0 are kept.
age_is_valid = data["Vict Age"].ge(0)
data = data.loc[age_is_valid]

In [None]:
# Convert 'DATE OCC' to datetime. `assign` returns a new frame instead of
# writing a column into `data`, which at this point is a filtered selection of
# the loaded frame; in-place column assignment on such a selection raises
# pandas' SettingWithCopyWarning.
data = data.assign(
    **{"DATE OCC": pd.to_datetime(data["DATE OCC"], format="%m/%d/%Y %I:%M:%S %p")}
)

# drop everything from 2024-01-01 onwards as data is incomplete
data = data[data["DATE OCC"] < pd.Timestamp("2024-01-01")]

### Data expansion

In [None]:
# Load the police station table; the columns X, Y, OBJECTID, PREC, DIVISION
# and LOCATION are used below.
police_stations_data = pd.read_csv("lapd_stations.csv")

input_epsg = CRS("EPSG:2229")   # NAD83 / California zone 5 (ftUS)
output_epsg = CRS("EPSG:4326")  # WGS 84

# Convert the projected station coordinates to longitude / latitude.
# With always_xy=True pyproj takes its input as (x, y) = (easting, northing)
# and returns (longitude, latitude), so "X" is passed first and "LON" is
# unpacked first.
# NOTE(review): assumes column "X" holds the easting and "Y" the northing -
# confirm against lapd_stations.csv.
transformer = Transformer.from_crs(input_epsg, output_epsg, always_xy=True)
police_stations_data["LON"], police_stations_data["LAT"] = transformer.transform(
    police_stations_data["X"].values,
    police_stations_data["Y"].values,
)

# drop ["X"], ["Y"], ["OBJECTID"] columns
police_stations_data = police_stations_data.drop(columns=["X", "Y", "OBJECTID"])

# move column "PREC" to be first and order the rows by it
police_stations_data = police_stations_data[
    ["PREC", "DIVISION", "LOCATION", "LAT", "LON"]
].sort_values(by="PREC")

police_stations_data.head()


### Data mining

In [None]:
# maps common: boundary overlays shared by the map cells


def _boundary_style(feature):
    """Base style of a boundary polygon (same for every feature)."""
    return {
        "fillColor": "#abcbff",
        "color": "black",
        "weight": 1,
        "dashArray": "3, 3",
    }


def _boundary_highlight(feature):
    """Style override passed to folium as the highlight function."""
    return {"fillColor": "#458cff"}


def _boundary_layer(path, layer_name, popup):
    """Build a GeoJson overlay from the file at `path` using the shared styling."""
    return folium.GeoJson(
        path,
        style_function=_boundary_style,
        highlight_function=_boundary_highlight,
        smooth_factor=2.0,
        name=layer_name,
        popup=popup,
        zoom_on_click=True,
    )


# Neighborhood polygons; the popup shows each feature's "name" property.
popup_name_neighborhoods = folium.GeoJsonPopup(fields=["name"], labels=True)
geojson_neighborhoods = _boundary_layer(
    "la_neighborhoods.geojson", "LA neighborhoods", popup_name_neighborhoods
)

# Police district polygons; the popup shows each feature's "APREC" property.
# TODO(review): mapping PREC numbers to station names (via the police station
# table) is not done here.
popup_name_districts = folium.GeoJsonPopup(fields=["APREC"], labels=True)
geojson_districts = _boundary_layer(
    "lapd_districts.geojson", "LA districts", popup_name_districts
)

In [None]:
# Summary statistics of victim age for records where "Vict Sex" is "M"
is_male = data["Vict Sex"] == "M"
data.loc[is_male, "Vict Age"].describe()

In [None]:
# Summary statistics of victim age for records where "Vict Sex" is "F"
is_female = data["Vict Sex"] == "F"
data.loc[is_female, "Vict Age"].describe()

In [None]:
# Bar chart of the number of records per "AREA NAME" value
crimes_per_area_name = data["AREA NAME"].value_counts()
ax = crimes_per_area_name.plot.bar()
ax.set_title("Number of crimes for specific area")
ax.set_xlabel("Area name")
ax.set_ylabel("Number of crimes")
plt.show()

In [None]:
# global heatmap of crimes

if MAPS_ENABLED:
    # NOTE: `map` shadows the builtin; the name is kept because the other map
    # cells bind the same name.
    map = folium.Map(location=[34.0522, -118.2437], zoom_start=10, tiles="CartoDB positron")
    geojson_neighborhoods.add_to(map)

    # One [lat, lon] pair per record, converted in a single vectorised step
    # rather than a row-by-row iterrows() loop.
    heat_data = data[["LAT", "LON"]].values.tolist()
    HeatMap(heat_data).add_to(map)

    # Jupyter only auto-displays a bare expression when it is the last
    # top-level statement of the cell; nested in this `if` it would be
    # discarded, so the map is rendered explicitly. `display` is provided by
    # IPython in notebook kernels.
    display(map)

In [None]:
# map of police stations and districts with crime count

if MAPS_ENABLED:
    map = folium.Map(location=[34.0522, -118.2437], zoom_start=10, tiles="CartoDB positron")

    # Count the records per AREA once; the style callback below is invoked per
    # district feature and previously recomputed this on every call.
    crimes_per_area = data["AREA"].value_counts()

    # Colour scale spanning the smallest to the largest per-area count.
    colormap = folium.LinearColormap(
        colors=["green", "yellow", "orange", "red"],
        vmin=crimes_per_area.min(),
        vmax=crimes_per_area.max(),
        caption="Number of crimes",
    )
    colormap.add_to(map)

    # Districts are filled according to the count of their "PREC" property;
    # a district with no matching AREA value falls back to a count of 0.
    # NOTE(review): assumes the geojson "PREC" values have the same type and
    # meaning as data["AREA"] - confirm against lapd_districts.geojson.
    geojson_crimes = folium.GeoJson(
        "lapd_districts.geojson",
        style_function=lambda feature: {
            "fillColor": colormap(crimes_per_area.get(feature["properties"]["PREC"], 0)),
            "color": "black",
            "weight": 1,
            "dashArray": "3, 3",
        },
        smooth_factor=2.0,
        popup=folium.GeoJsonPopup(fields=["APREC"], labels=True)
    )
    geojson_crimes.add_to(map)

    # One marker per police station, with its LOCATION value as the popup text.
    for station in police_stations_data.itertuples(index=False):
        folium.Marker(
            location=[station.LAT, station.LON],
            popup=station.LOCATION,
            icon=folium.Icon(color="darkblue", icon="info-sign"),
        ).add_to(map)

    # A bare `map` expression inside the `if` is not auto-displayed by Jupyter
    # (only the last top-level expression of a cell is), so render explicitly.
    # `display` is provided by IPython in notebook kernels.
    display(map)

In [None]:
# The five most frequent "Crm Cd" values with their counts.
top_crime_codes = data["Crm Cd"].value_counts().iloc[:5]
print("Most popular crime:\n", top_crime_codes)
# Here we can see that the most common crime is stealing vehicle.

In [None]:
def _monthly_counts(area_data):
    """Return the number of rows of `area_data` per calendar month of 'DATE OCC'."""
    # Group by the subset's own dates rather than a key taken from the full frame.
    return area_data.groupby(area_data["DATE OCC"].dt.to_period("M")).size()


d1 = data[data["AREA NAME"] == "Harbor"]
d2 = data[data["AREA NAME"] == "Central"]
d3 = data[data["AREA NAME"] == "Newton"]
# Group the data by month and year and count the number of crime occurrences
count_by_month_year1 = _monthly_counts(d1)
count_by_month_year2 = _monthly_counts(d2)
count_by_month_year3 = _monthly_counts(d3)

# Draw all three series on one explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 6))
count_by_month_year1.plot(ax=ax, marker="o", linestyle="-", label="Harbor")
count_by_month_year2.plot(ax=ax, marker="o", linestyle="-", label="Central")
# The third series is filtered on "Newton", so it is labelled accordingly
# (it was previously labelled "Pacific").
count_by_month_year3.plot(ax=ax, marker="o", linestyle="-", label="Newton")

ax.set_title("Number of occurrences of crimes for specific areas")
ax.set_xlabel("Month and year")
ax.set_ylabel("Number of occurrences")
ax.legend()
plt.show()

In [None]:
# heat map of crimes over time by month

if MAPS_ENABLED:
    map = folium.Map(location=[34.0522, -118.2437], zoom_start=10, tiles="CartoDB positron")
    geojson_neighborhoods.add_to(map)

    # Compute each record's month once and split the coordinates per month in
    # a single groupby pass (group keys are returned in sorted order), instead
    # of re-filtering and iterating the whole frame for every month.
    month_of_record = data["DATE OCC"].dt.to_period("M")
    time_index = []
    heat_data = []
    for month, month_rows in data.groupby(month_of_record)[["LAT", "LON"]]:
        time_index.append(month)
        heat_data.append(month_rows.values.tolist())

    # map period to string yyyy-mm
    HeatMapWithTime(heat_data, index=[str(x) for x in time_index], max_opacity=0.8).add_to(map)

    # A bare `map` expression inside the `if` is not auto-displayed by Jupyter
    # (only the last top-level expression of a cell is), so render explicitly.
    # `display` is provided by IPython in notebook kernels.
    display(map)