In [23]:
import os
import time
from tqdm import tqdm

import shapely
import numpy as np
import pandas as pd
import geopandas as gpd

import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
# Location of the SFPD incident CSV. The SFPD_CRIME_DATA_PATH environment
# variable, when set, overrides the machine-specific default below so the
# notebook can be run elsewhere without editing this cell.
data_path = os.environ.get(
    "SFPD_CRIME_DATA_PATH",
    "/Users/administrator/Documents/Projects/sf-crime-exploration/data/SFPD_Crime_Data_Full.csv",
)

# Parse the "Date" column into datetime64 values while reading.
train_data = pd.read_csv(data_path, parse_dates=["Date"])

# Preview the first rows and report how many records were loaded.
print(train_data.head())
print("The amount of training data is {}.".format(len(train_data)))

        Date   Time Day of Week       Category    District   Latitude  \
0 2011-07-01  08:00      FRIDAY       WARRANTS    NORTHERN  37.802151   
1 2005-10-18  14:30     TUESDAY  DRUG/NARCOTIC  TENDERLOIN  37.779944   
2 2005-01-29  13:45    SATURDAY  VEHICLE THEFT     BAYVIEW  37.737576   
3 2011-06-02  02:52    THURSDAY   NON-CRIMINAL     CENTRAL  37.803109   
4 2003-02-01  08:00    SATURDAY  VEHICLE THEFT     BAYVIEW  37.724556   

    Longitude  
0 -122.439758  
1 -122.414318  
2 -122.388799  
3 -122.414354  
4 -122.401097  
The amount of training data is 2534378.


The DataFrame contains the following columns:

* Date: day the incident occurred, in yyyy-mm-dd format.
* Time: time the incident occurred, in 24-hour (HH:MM) format.
* Day of Week: day of the week the incident occurred.
* Category: the classification of the incident.
* District: the police district in which the incident occurred.
* Latitude: the latitude of the location where the incident occurred.
* Longitude: the longitude of the location where the incident occurred.

In [25]:
# Count fully duplicated rows and time the scan. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals; time.time()
# follows the system wall clock, which can be adjusted while the cell runs.
start_time = time.perf_counter()
num_duplicates = train_data.duplicated().sum()
end_time = time.perf_counter()

print("Entire operation took {} seconds.".format(end_time - start_time))
print("There are {} duplicate entries.".format(num_duplicates))

Entire operation took 1.7886619567871094 seconds.
There are 179089 duplicate entries.


There are 179089 duplicate entries that should be removed.

In [26]:
# Keep only the first occurrence of each fully duplicated row. The index is
# not reset, so row labels no longer match row positions after this step.
train_data = train_data.drop_duplicates()

Let's analyze the coordinates of the incidents to see if there are any irregularities. Since there are a lot of points to examine, we will simply check that each coordinate pair lies within the bounding box of San Francisco County.

The bounding box is given by:
* BBox East Longitude: -122.28178
* BBox West Longitude: -123.173825
* BBox North Latitude: 37.929824
* BBox South Latitude: 37.63983

In [27]:
# Bounding box of San Francisco county in degrees, ordered as
# (west longitude, east longitude, south latitude, north latitude).
SF_COUNTY_BBOX = (-123.173825, -122.28178, 37.63983, 37.929824)


def validate_coordinates(coords, bounds=SF_COUNTY_BBOX):
    """Determines if the location represented by the coordinates is contained in a bounding box.

    Parameters
    ----------
    coords : tuple (float, float)
        The coordinates in (longitude, latitude) representing the location to verify.
    bounds : tuple (float, float, float, float), optional
        The bounding box as (west longitude, east longitude, south latitude, north latitude).
        Defaults to the San Francisco county bounding box.

    Returns
    -------
    valid : bool
        Returns ``True`` if the location lies inside the bounding box (edges included).
        Returns ``False`` otherwise, including when either coordinate is NaN.

    """
    lon = coords[0]
    lat = coords[1]
    west, east, south, north = bounds
    # bool() guarantees a plain Python bool even when the inputs are NumPy scalars.
    return bool(west <= lon <= east and south <= lat <= north)

In [35]:
# Walk over every (Longitude, Latitude) pair and record the positional index
# of each row that falls outside the San Francisco county bounding box.
train_coords = train_data[["Longitude", "Latitude"]]
invalid_coord_indices = []

for idx, row in tqdm(enumerate(train_coords.itertuples()), total=len(train_coords)):
    # Element 0 of each tuple is the pandas index label; the coordinates follow it.
    if not validate_coordinates(row[1:3]):
        invalid_coord_indices.append(idx)

print("\nThere are {} invalid entries.".format(len(invalid_coord_indices)))

100%|██████████| 2355289/2355289 [00:03<00:00, 687690.66it/s]
There are 127 invalid entries.



There are 127 invalid entries. Let's explore them.

In [40]:
# Show the coordinates of the rows flagged as outside the bounding box.
# invalid_coord_indices holds row positions, hence the positional .iloc lookup.
invalid_coords = train_data.iloc[invalid_coord_indices][["Longitude", "Latitude"]]
print(invalid_coords)

         Longitude  Latitude
2137        -120.5      90.0
8478        -120.5      90.0
12880       -120.5      90.0
20953       -120.5      90.0
28594       -120.5      90.0
...            ...       ...
2025503     -120.5      90.0
2034164     -120.5      90.0
2064119     -120.5      90.0
2079990     -120.5      90.0
2086098     -120.5      90.0

[127 rows x 2 columns]


Looks like for all of these entries, the Longitude is set to -120.5 and the Latitude is set to 90.0.

We can either discard these data points or consolidate them by replacing their coordinates with the average coordinates of the district where the crime occurred. We'll consolidate them.