In [1]:
import os
import time
from tqdm import tqdm

import shapely
import numpy as np
import pandas as pd
import geopandas as gpd

import seaborn as sns
import matplotlib.pyplot as plt

### Dataset 1

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where the CSV exists at exactly this location.
path = "/Users/administrator/Documents/Projects/sf-crime-exploration/data/SFPD_Crime_Data_2003_2018.csv"
dataframe = pd.read_csv(path)

# Show the column labels of the freshly loaded frame.
dataframe.columns

Index(['PdId', 'IncidntNum', 'Incident Code', 'Category', 'Descript',
       'DayOfWeek', 'Date', 'Time', 'PdDistrict', 'Resolution', 'Address', 'X',
       'Y', 'location', 'SF Find Neighborhoods 2 2',
       'Current Police Districts 2 2', 'Current Supervisor Districts 2 2',
       'Analysis Neighborhoods 2 2', 'DELETE - Fire Prevention Districts 2 2',
       'DELETE - Police Districts 2 2', 'DELETE - Supervisor Districts 2 2',
       'DELETE - Zip Codes 2 2', 'DELETE - Neighborhoods 2 2',
       'DELETE - 2017 Fix It Zones 2 2',
       'Civic Center Harm Reduction Project Boundary 2 2',
       'Fix It Zones as of 2017-11-06  2 2', 'DELETE - HSOC Zones 2 2',
       'Fix It Zones as of 2018-02-07 2 2',
       'CBD, BID and GBD Boundaries as of 2017 2 2',
       'Areas of Vulnerability, 2016 2 2',
       'Central Market/Tenderloin Boundary 2 2',
       'Central Market/Tenderloin Boundary Polygon - Updated 2 2',
       'HSOC Zones as of 2018-06-05 2 2', 'OWED Public Spaces 2 2',
       

Check for NaN values for each of the columns that we are interested in.

In [3]:
# Number of rows whose "Category" value is missing.
int(dataframe["Category"].isna().sum())

0

In [4]:
# Number of rows whose "DayOfWeek" value is missing.
int(dataframe["DayOfWeek"].isna().sum())

0

In [5]:
# Number of rows whose "Date" value is missing.
int(dataframe["Date"].isna().sum())

0

In [6]:
# Number of rows whose "Time" value is missing.
int(dataframe["Time"].isna().sum())

0

In [7]:
# Number of rows whose "PdDistrict" value is missing.
int(dataframe["PdDistrict"].isna().sum())

1

There is one missing entry in the PdDistrict. Let's see what it is.

In [8]:
# Full rows for which "PdDistrict" is missing.
dataframe.loc[dataframe["PdDistrict"].isna()]

Unnamed: 0,PdId,IncidntNum,Incident Code,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,...,Fix It Zones as of 2017-11-06 2 2,DELETE - HSOC Zones 2 2,Fix It Zones as of 2018-02-07 2 2,"CBD, BID and GBD Boundaries as of 2017 2 2","Areas of Vulnerability, 2016 2 2",Central Market/Tenderloin Boundary 2 2,Central Market/Tenderloin Boundary Polygon - Updated 2 2,HSOC Zones as of 2018-06-05 2 2,OWED Public Spaces 2 2,Neighborhoods 2
1775753,16601857306244,166018573,6244,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Sunday,01/17/2016,23:54,,NONE,...,,,,,,,,,,


Though the district information is missing, the latitude and longitude are present, which means we can recover the district if we have the polygons representing the police districts.

In [9]:
# Coordinates of the rows for which "PdDistrict" is missing.
dataframe.loc[dataframe["PdDistrict"].isna(), ["X", "Y"]]

Unnamed: 0,X,Y
1775753,-122.413352,37.708202


In [10]:
# Number of rows whose "X" (longitude) value is missing.
int(dataframe["X"].isna().sum())

0

In [11]:
# Number of rows whose "Y" (latitude) value is missing.
int(dataframe["Y"].isna().sum())

0

Let's check to see how many duplicate entries there are.

In [12]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
dataframe.duplicated().sum()

0

There are no duplicate entries.

Let's examine the coordinates to ensure that we do not have any outliers. The outliers will be the coordinates outside of the following bounding box:
* BBox East Longitude: -122.28178
* BBox West Longitude: -123.173825
* BBox North Latitude: 37.929824
* BBox South Latitude: 37.63983

In [13]:
def validate_coordinates(coords):
    """Determines if the location represented by the coordinates is contained within the San Francisco county bounding box.

    The bounding box is:

    * West longitude: -123.173825
    * East longitude: -122.28178
    * South latitude: 37.63983
    * North latitude: 37.929824

    Points lying exactly on the boundary count as inside. A ``NaN`` coordinate is never valid, because every
    comparison against ``NaN`` evaluates to ``False``.

    Parameters
    ----------
    coords : tuple (float, float)
        The coordinates in (longitude, latitude) representing the location to verify.

    Returns
    -------
    valid : bool
        Returns ``True`` if the location is contained within the San Francisco county bounding box. Returns ``False`` otherwise.

    """
    west, east = -123.173825, -122.28178
    # The southern bound was previously written as 37.623983, which disagreed with the
    # bounding box documented in the notebook (37.63983).
    south, north = 37.63983, 37.929824

    x = coords[0]
    y = coords[1]
    # bool() keeps the return type a plain Python bool even when x / y are numpy scalars.
    return bool(west <= x <= east and south <= y <= north)

In [14]:
# Gather the positional index of every row whose (X, Y) pair fails the bounding-box check.
coordinates = dataframe[["X", "Y"]]
invalid_coord_indices = []

# index=False / name=None yields plain (X, Y) tuples, so no slicing is needed.
for idx, row in tqdm(enumerate(coordinates.itertuples(index=False, name=None)), total=len(coordinates)):
    if not validate_coordinates(row):
        invalid_coord_indices.append(idx)

print("\nThere are {} invalid entries".format(len(invalid_coord_indices)))

100%|██████████| 2160953/2160953 [00:05<00:00, 401125.68it/s]
There are 142 invalid entries



Let's check out some of the XY coordinates of the invalid entries.

In [15]:
# Coordinates of the rows flagged as invalid (indices are positional, hence iloc).
dataframe.iloc[invalid_coord_indices][["X", "Y"]]

Unnamed: 0,X,Y
2137,-120.5,90.0
8478,-120.5,90.0
12880,-120.5,90.0
20953,-120.5,90.0
28594,-120.5,90.0
...,...,...
2025504,-120.5,90.0
2034165,-120.5,90.0
2064120,-120.5,90.0
2079991,-120.5,90.0


The invalid entries all have the coordinates (-120.5, 90.0)

### Dataset 2

In [16]:
# NOTE(review): absolute, machine-specific path — this cell only runs where the CSV exists at exactly this location.
# Rebinds `path` and `dataframe`; the frame loaded earlier under this name is replaced.
path = "/Users/administrator/Documents/Projects/sf-crime-exploration/data/SFPD_Crime_Data_2018_Present.csv"
dataframe = pd.read_csv(path)

# Show the column labels of the freshly loaded frame.
dataframe.columns

Index(['Incident Datetime', 'Incident Date', 'Incident Time', 'Incident Year',
       'Incident Day of Week', 'Report Datetime', 'Row ID', 'Incident ID',
       'Incident Number', 'CAD Number', 'Report Type Code',
       'Report Type Description', 'Filed Online', 'Incident Code',
       'Incident Category', 'Incident Subcategory', 'Incident Description',
       'Resolution', 'Intersection', 'CNN', 'Police District',
       'Analysis Neighborhood', 'Supervisor District', 'Latitude', 'Longitude',
       'point', 'SF Find Neighborhoods', 'Current Police Districts',
       'Current Supervisor Districts', 'Analysis Neighborhoods',
       'HSOC Zones as of 2018-06-05', 'OWED Public Spaces',
       'Central Market/Tenderloin Boundary Polygon - Updated',
       'Parks Alliance CPSI (27+TL sites)', 'ESNCAG - Boundary File',
       'Areas of Vulnerability, 2016'],
      dtype='object')

Check for some NaN values in the categories that we are interested in. Also check for duplicates.

In [17]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
dataframe.duplicated().sum()

0

In [18]:
# Number of rows whose "Incident Date" value is missing.
int(dataframe["Incident Date"].isna().sum())

0

In [19]:
# Number of rows whose "Incident Category" value is missing.
int(dataframe["Incident Category"].isna().sum())

267

In [20]:
# Category-related columns for the rows that have no "Incident Category".
dataframe.loc[
    dataframe["Incident Category"].isna(),
    ["Incident Category", "Incident Subcategory", "Incident Description"],
]

Unnamed: 0,Incident Category,Incident Subcategory,Incident Description
1235,,,"Public Health Order Violation, Notification"
2091,,,"Public Health Order Violation, Notification"
2157,,,"Public Health Order Violation, Notification"
2364,,,"Driving, Sideshow/Street Racing"
2451,,,"Driving, Sideshow/Street Racing"
...,...,...,...
392967,,,"Public Health Order Violation, Notification"
392983,,,"Public Health Order Violation, Notification"
393428,,,Military Ordinance
393553,,,"Public Health Order Violation, Notification"


Looks like most of the entries that are missing the Category are public health order violations or smaller ordinances. These are not exactly "crimes" so it is OK to drop them. Let's see how many instances of this there are.

In [21]:
# Distinct incident descriptions among the rows that have no "Incident Category".
dataframe.loc[dataframe["Incident Category"].isna(), "Incident Description"].unique()

array(['Public Health Order Violation, Notification',
       'Driving, Sideshow/Street Racing', 'Auto Impounded',
       'Public Health Order Violation, After Notification',
       'Sexual Assault, Aggravated, of Child',
       'Service of Documents Related to a Civil Drug Abatement and/or Public Nuisance Action',
       'Theft, Boat', 'Military Ordinance',
       'Crimes Involving Receipts or Titles',
       'Procurement, Pimping, & Pandering',
       'Gun Violence Restraining Order',
       'Assault, Commission of While Armed', 'Theft, Animal, Att.',
       'Gun Violence Restraining Order Violation'], dtype=object)

In [22]:
# Frequency of each incident description among the rows that have no "Incident Category".
dataframe.loc[dataframe["Incident Category"].isna(), "Incident Description"].value_counts()

Public Health Order Violation, Notification                                             146
Public Health Order Violation, After Notification                                        41
Driving, Sideshow/Street Racing                                                          33
Sexual Assault, Aggravated, of Child                                                     10
Military Ordinance                                                                        9
Service of Documents Related to a Civil Drug Abatement and/or Public Nuisance Action      8
Theft, Boat                                                                               6
Gun Violence Restraining Order                                                            4
Auto Impounded                                                                            4
Procurement, Pimping, & Pandering                                                         2
Theft, Animal, Att.                                                             

For these entries, we can just list them as "Other" for the "Incident Category" column.

In [23]:
# Index labels of the rows that have no "Incident Category".
missing_indices = dataframe.index[dataframe["Incident Category"].isna().to_numpy()]

In [24]:
# Label every row that had no category as "Other" in one vectorized, label-based
# assignment instead of one .loc write per row.
dataframe.loc[missing_indices, "Incident Category"] = "Other"

In [25]:
# Display the index labels collected for the rows that had no "Incident Category".
missing_indices

Int64Index([  1235,   2091,   2157,   2364,   2451,   2497,   3327,   4584,
              5936,   6068,
            ...
            382427, 384682, 386140, 390094, 390695, 392967, 392983, 393428,
            393553, 393899],
           dtype='int64', length=267)

In [26]:
# `missing_indices` holds index *labels*, so select with label-based .loc. Positional
# .iloc only returned the same rows because the frame has a default 0..n-1 index.
dataframe.loc[missing_indices, "Incident Category"]

1235      Other
2091      Other
2157      Other
2364      Other
2451      Other
          ...  
392967    Other
392983    Other
393428    Other
393553    Other
393899    Other
Name: Incident Category, Length: 267, dtype: object

Or maybe instead of writing "Other" we can use the incident code.

In [27]:
# Number of rows whose "Incident Code" value is missing.
int(dataframe["Incident Code"].isna().sum())

0

Aha! So there are no missing incident codes. Therefore it should be possible for us to use this information to recover the incident category.

Let's check some of the incident codes for the missing entries and connect them using the following dataset. 

https://data.sfgov.org/Public-Safety/Reference-Police-Department-Incident-Code-Crosswal/ci9u-8awy

In [28]:
# `missing_indices` holds index *labels*, so select with label-based .loc. Positional
# .iloc only returned the same rows because the frame has a default 0..n-1 index.
dataframe.loc[missing_indices, ["Incident Code", "Incident Description"]]

Unnamed: 0,Incident Code,Incident Description
1235,27400,"Public Health Order Violation, Notification"
2091,27400,"Public Health Order Violation, Notification"
2157,27400,"Public Health Order Violation, Notification"
2364,65021,"Driving, Sideshow/Street Racing"
2451,65021,"Driving, Sideshow/Street Racing"
...,...,...
392967,27400,"Public Health Order Violation, Notification"
392983,27400,"Public Health Order Violation, Notification"
393428,12075,Military Ordinance
393553,27400,"Public Health Order Violation, Notification"


Unfortunately, there are a bunch of incident codes that are not mapped through the Police Department, so we should just stick with the "Other" designation for crimes that have no Incident Category.

Let's continue checking for null values, with the Police District column.

In [29]:
# Number of rows whose "Police District" value is missing.
int(dataframe["Police District"].isna().sum())

0

Looks like there are no missing entries in the Police Districts column.

Let's take a look at the Latitude/Longitude pairs to see if they are valid entries.

In [32]:
# Gather the positional index of every row whose (Longitude, Latitude) pair fails the bounding-box check.
coordinates = dataframe[["Longitude", "Latitude"]]
invalid_coord_indices = []

# index=False / name=None yields plain (Longitude, Latitude) tuples, so no slicing is needed.
for idx, row in tqdm(enumerate(coordinates.itertuples(index=False, name=None)), total=len(coordinates)):
    if not validate_coordinates(row):
        invalid_coord_indices.append(idx)

print("\nThere are {} invalid entries".format(len(invalid_coord_indices)))

100%|██████████| 394025/394025 [00:00<00:00, 498847.14it/s]
There are 20332 invalid entries



There are a whopping 20332 invalid entries! Let's examine them.

In [36]:
# Coordinates of the rows flagged as invalid (indices are positional, hence iloc).
dataframe[["Longitude", "Latitude"]].iloc[invalid_coord_indices]

Unnamed: 0,Longitude,Latitude
0,,
1,,
2,,
33,,
66,,
...,...,...
393835,,
393837,,
393854,,
393858,,


Hmm, interesting, they just have NaN values. Mayhaps we can just replace them with the center of mass of the police district polygons.

Let's examine if these entries are missing information in the Police District column. Additionally, let's see how many of these entries are "Out of SF"

In [39]:
# Police district of each row flagged as having invalid coordinates.
dataframe["Police District"].iloc[invalid_coord_indices]

0         Out of SF
1              Park
2              Park
33         Richmond
66         Southern
            ...    
393835         Park
393837         Park
393854     Northern
393858      Taraval
393946     Southern
Name: Police District, Length: 20332, dtype: object

In [44]:
# How many of the rows with invalid coordinates are recorded as "Out of SF".
int((dataframe["Police District"].iloc[invalid_coord_indices] == "Out of SF").sum())

6797

Of the 20332 invalid coordinates, 6797 of them are outside of San Francisco County. There is no remedy for this, so we need to just remove these entries.

The other entries, on the other hand, can be remedied by making their coordinates the center of mass of their police district polygons.