In [37]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, HDBSCAN, OPTICS
from sklearn.metrics.pairwise import haversine_distances

# Important for conversion of lat-long based distance to meters
RADIUS_OF_EARTH_AT_SPACE_NEEDLE = 6366.512563943 # km

In [16]:
# Helper functions
def extract_yearly_data(df, dirname, base_filename, year_range):
    dirpath = os.path.join(os.getcwd(), dirname)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    path = os.path.join(dirpath, base_filename)
    for year in year_range:
        year_df = df.loc[df["Year"] == year]
        year_df.to_csv(path + "_year_" + str(year) + ".csv", index=False)
    return


def meters_to_hav(meters, R=RADIUS_OF_EARTH_AT_SPACE_NEEDLE):
    """Converts a distance in meters to haversine distance"""
    hav = meters / (R * 1000)
    return hav


def get_range(df, col):
    min_val = df[col].min()
    max_val = df[col].max()
    return min_val, max_val


def linearly_interpolate(x, old_interval, new_interval):
    a, b = old_interval
    c, d = new_interval
    new_x = c + (x - a) * ((d - c) / b - a)
    return new_x

In [17]:
# Distance metrics I was playing with
def euclidean_dist(x, y):
    return np.sqrt(np.sum((x - y) ** 2))


def haversine_np(lon1, lat1, lon2, lat2, R=RADIUS_OF_EARTH_AT_SPACE_NEEDLE):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    km = R * c
    return km


def linearized_haversine(a, b, R=RADIUS_OF_EARTH_AT_SPACE_NEEDLE):
    a_rad = a * (2 * np.pi / 360)
    b_rad = b * (2 * np.pi / 360)
    x_1, y_1 = a_rad
    x_2, y_2 = b_rad
    delta_x = R * np.cos(y_1) * (x_2 - x_1)
    delta_y = R * (y_2 - y_1)
    return np.sqrt(delta_x**2 + delta_y**2)

In [18]:
# Load the crime data
crime_df = pd.read_csv("SPD_Crime_Data__2008-Present.csv")

In [19]:
# Drop the crimes with missing time, category, or spatial data
crime_df = crime_df.dropna(
    axis=0,
    how="any",
    subset=["Offense Start DateTime", "Offense Parent Group", "Longitude", "Latitude"],
    inplace=False,
)

# Filter crimes with wacky position data
crime_df = crime_df.loc[(crime_df["Latitude"] > 47) & (crime_df["Latitude"] < 48) ]
crime_df = crime_df.loc[(crime_df["Longitude"] > -123) & (crime_df["Longitude"] < -122)]

# Drop report information to reduce data size
crime_df = crime_df.drop(
    axis=1, columns=["Report Number", "Report DateTime"], inplace=False
)

# Convert time to datetime column
crime_df["Offense Start DateTime"] = pd.to_datetime(crime_df["Offense Start DateTime"])

# Extract offense year, month, day, and time of day
crime_df['Year'] = crime_df['Offense Start DateTime'].dt.year
crime_df['Month'] = crime_df['Offense Start DateTime'].dt.month.astype(int)
crime_df['Day'] = crime_df['Offense Start DateTime'].dt.dayofweek.astype(int)
crime_df['Time'] = crime_df['Offense Start DateTime'].dt.time

# Filter to crimes since 2008
crime_df = crime_df.loc[crime_df["Year"] >= 2008]

# Express lat and long in radians for haversine metric
crime_df["long_rad"] = crime_df["Longitude"].apply(np.radians)
crime_df["lat_rad"] = crime_df["Latitude"].apply(np.radians)

In [20]:
crime_df.head()

Unnamed: 0,Offense ID,Offense Start DateTime,Offense End DateTime,Group A B,Crime Against Category,Offense Parent Group,Offense,Offense Code,Precinct,Sector,...,MCPP,100 Block Address,Longitude,Latitude,Year,Month,Day,Time,long_rad,lat_rad
0,12605873663,2020-02-05 10:10:00,,A,SOCIETY,DRUG/NARCOTIC OFFENSES,Drug/Narcotic Violations,35A,W,Q,...,MAGNOLIA,32XX BLOCK OF 23RD AVE W,-122.385974,47.649387,2020,2,2,10:10:00,-2.136038,0.831639
1,12605598696,2020-02-03 08:00:00,02/04/2020 08:00:00 AM,A,PROPERTY,LARCENY-THEFT,Theft of Motor Vehicle Parts or Accessories,23G,N,J,...,ROOSEVELT/RAVENNA,63XX BLOCK OF 5TH AVE NE,-122.323399,47.675118,2020,2,0,08:00:00,-2.134946,0.832088
2,12605567653,2020-02-02 20:30:00,02/02/2020 09:30:00 PM,A,PROPERTY,ROBBERY,Robbery,120,N,U,...,ROOSEVELT/RAVENNA,26TH AVE NE / NE BLAKELEY ST,-122.299552,47.666384,2020,2,6,20:30:00,-2.13453,0.831935
3,12605174036,2020-02-05 01:17:00,02/05/2020 02:21:00 AM,A,PROPERTY,DESTRUCTION/DAMAGE/VANDALISM OF PROPERTY,Destruction/Damage/Vandalism of Property,290,W,Q,...,MAGNOLIA,22XX BLOCK OF W RAYE ST,-122.384865,47.642927,2020,2,2,01:17:00,-2.136019,0.831526
4,12605081469,2020-02-05 00:51:21,,B,SOCIETY,DRIVING UNDER THE INFLUENCE,Driving Under the Influence,90D,N,B,...,BALLARD SOUTH,NW 46TH ST / 8TH AVE NW,-122.366195,47.662193,2020,2,2,00:51:21,-2.135693,0.831862


In [11]:
# Create yearly extracts if you want
extract_yearly_data(crime_df, "yearly_extracts", "SPD_Crime_Data", range(2008, 2025))

In [21]:
# Save cleaned data
crime_df.to_csv("cleaned_SPD_Crime_Data.csv", index=False)

In [22]:
# Read the cleaned data
crime_df = pd.read_csv("cleaned_SPD_Crime_Data.csv")

In [8]:
# Checking range of lat and long columns
min_long, max_long = get_range(crime_df, "Longitude")
min_lat, max_lat = get_range(crime_df, "Latitude")

In [9]:
x_range = 1000 * haversine_np(min_long, min_lat, max_long, min_lat)
y_range = 1000 * haversine_np(min_long, min_lat, min_long, max_lat)

x_range, y_range

(15724.679469803265, 36253.28281720292)

In [10]:
# A linearized version introduces very little error, if we need it
x_range_lin = 1000 * linearized_haversine(np.array([min_long, min_lat]), np.array([max_long, min_lat]))
y_range_lin = 1000 * linearized_haversine(np.array([min_long, min_lat]), np.array([min_long, max_lat]))

x_range_lin, y_range_lin

(15724.68421289776, 36253.28281720292)

In [24]:
# Test runs on cap hill in 2023
cap_hill_23 = crime_df.loc[
    (crime_df["Year"] == 2023) & (crime_df["MCPP"] == "CAPITOL HILL")
]
cap_hill_23.head()

Unnamed: 0,Offense ID,Offense Start DateTime,Offense End DateTime,Group A B,Crime Against Category,Offense Parent Group,Offense,Offense Code,Precinct,Sector,...,MCPP,100 Block Address,Longitude,Latitude,Year,Month,Day,Time,long_rad,lat_rad
974994,41318760607,2023-01-18 00:00:00,01/18/2023 11:59:00 PM,A,PERSON,ASSAULT OFFENSES,Intimidation,13C,E,E,...,CAPITOL HILL,6XX BLOCK OF HARVARD AVE E,-122.321983,47.624694,2023,1,2,00:00:00,-2.134921,0.831208
975041,41315958431,2023-01-30 10:30:00,,B,SOCIETY,TRESPASS OF REAL PROPERTY,Trespass of Real Property,90J,E,C,...,CAPITOL HILL,14XX BLOCK OF E HOWELL ST,-122.313516,47.617626,2023,1,0,10:30:00,-2.134774,0.831084
975092,41312435583,2023-01-29 21:40:00,01/29/2023 10:12:00 PM,B,SOCIETY,DRIVING UNDER THE INFLUENCE,Driving Under the Influence,90D,E,E,...,CAPITOL HILL,14XX BLOCK OF 11TH AVE,-122.318132,47.613513,2023,1,6,21:40:00,-2.134854,0.831013
975178,41288788167,2023-01-28 19:20:00,01/29/2023 03:20:00 AM,A,PROPERTY,LARCENY-THEFT,Theft From Building,23D,E,E,...,CAPITOL HILL,1XX BLOCK OF 10TH AVE E,-122.31953,47.619333,2023,1,5,19:20:00,-2.134879,0.831114
975187,41288762898,2023-01-28 02:00:00,01/28/2023 02:01:00 AM,A,PROPERTY,LARCENY-THEFT,Theft From Building,23D,E,E,...,CAPITOL HILL,15XX BLOCK OF 11TH AVE,-122.31815,47.614674,2023,1,5,02:00:00,-2.134854,0.831033


In [75]:
# Test precomputing distance matrix---it's fast, which is good
latlong = cap_hill_23[["lat_rad", "long_rad"]].to_numpy()
distances = haversine_distances(latlong)
distances * (RADIUS_OF_EARTH_AT_SPACE_NEEDLE * 1000) # Approx distance in meters between crimes

array([[   0.        , 1009.45075617, 1275.44504381, ...,  343.80890441,
         343.80890441,  341.87047085],
       [1009.45075617,    0.        ,  573.07081297, ..., 1266.67123412,
        1266.67123412, 1114.80616355],
       [1275.44504381,  573.07081297,    0.        , ..., 1599.70320173,
        1599.70320173, 1220.48001123],
       ...,
       [ 343.80890441, 1266.67123412, 1599.70320173, ...,    0.        ,
           0.        ,  618.30479661],
       [ 343.80890441, 1266.67123412, 1599.70320173, ...,    0.        ,
           0.        ,  618.30479661],
       [ 341.87047085, 1114.80616355, 1220.48001123, ...,  618.30479661,
         618.30479661,    0.        ]])

### Grouping of crime types
1. We need to group the crime types somehow. I haven't done it yet though

In [None]:
# Define some interest groups in the future
assault_group = ["ASSAULT OFFENSES"]
theft_group = ["LARCENY-THEFT", "BURGLARY/BREAKING&ENTERING" ]

In [25]:
crime_df.columns

Index(['Offense ID', 'Offense Start DateTime', 'Offense End DateTime',
       'Group A B', 'Crime Against Category', 'Offense Parent Group',
       'Offense', 'Offense Code', 'Precinct', 'Sector', 'Beat', 'MCPP',
       '100 Block Address', 'Longitude', 'Latitude', 'Year', 'Month', 'Day',
       'Time', 'long_rad', 'lat_rad'],
      dtype='object')

### Plan
1. GridSearch---store every labeling in output dataframe for comparison in Tableau

In [133]:
def top_blocks(crime_df, n):
    top_blocks = crime_df.groupby(['Latitude', 'Longitude'])\
        .size()\
        .to_frame(name='count')\
        .reset_index()\
        .sort_values(by=['count'], ascending=False)\
        .head(n)
    return top_blocks["count"].values

top_blocks(cap_hill_23, 20)

array([72, 65, 62, 61, 55, 47, 45, 43, 41, 38, 37, 35, 33, 32, 32, 31, 31,
       29, 25, 23])

In [68]:
meters_to_hav(1000)

0.0001570718646914388

In [186]:
def run_dbscans(X, df, core_threshes, eps_meters=125):
    """DBSCAN grid search -> df + summaries

    Labels stored in output. Write to disk for comparison in Tableau.
    We need to fix epsilon small (100 meters) to constrain cluster size."""
    output_df = df.copy()
    run_summaries = []

    total_crime = X.shape[0]
    eps = meters_to_hav(eps_meters)

    for min_samples in core_threshes:
        print(f"DBSCAN clustering with eps={eps_meters}m, min_samples={min_samples}")
        # Cluster the data and extract the labels
        colname = "db_labs_eps=" + str(eps_meters) + "_ms=" + str(min_samples)
        db = DBSCAN(eps=eps, min_samples=min_samples, metric="haversine").fit(X)
        labels_db = db.labels_
        # Count clusters and noise
        num_clusters_ = len(set(labels_db)) - (1 if -1 in labels_db else 0)
        num_noise_ = list(labels_db).count(-1)
        percent_clustered_ = (total_crime - num_noise_) / total_crime
        print("Estimated number of clusters: %d" % num_clusters_)
        print("Estimated number of noise points: %d" % num_noise_)
        print(f"Estimated percentage of crime captured: {percent_clustered_}\n")
        summary = {
            "type": "DBSCAN",
            "model": db,
            "params": (eps_meters, min_samples),
            "num_clusters": num_clusters_,
            "num_noise": num_noise_,
            "percent_clustered": percent_clustered_,
        }
        # Save the labels in the output df
        run_summaries.append(summary)
        output_df[colname] = labels_db
    return output_df, run_summaries


def run_optics(X, df, core_threshes, max_eps_meters=200):
    """OPTICS grid search -> df + summaries

    Labels stored in output df. Write to disk for comparison in Tableau.
    Similarly, we need to fix a small neighborhood size to limit size."""
    output_df = df.copy()
    run_summaries = []

    total_crime = X.shape[0]
    max_eps = meters_to_hav(max_eps_meters)

    for min_samples in core_threshes:
        print(
            f"OPTICS clustering with max_eps={max_eps_meters}m, min_samples={min_samples}"
        )
        # Cluster the data and extract the labels
        colname = (
            "op_labs_eps="
            + str(max_eps_meters)
            + "_ms="
            + str(min_samples)
        )
        op = OPTICS(
            max_eps=max_eps,
            min_samples=min_samples,
            metric="haversine",
            cluster_method="dbscan",
        ).fit(X)
        labels_op = op.labels_
        # Count clusters and noise
        num_clusters_ = len(set(labels_op)) - (1 if -1 in labels_op else 0)
        num_noise_ = list(labels_op).count(-1)
        percent_clustered_ = (total_crime - num_noise_) / total_crime
        print("Estimated number of clusters: %d" % num_clusters_)
        print("Estimated number of noise points: %d" % num_noise_)
        print(f"Estimated percentage of crime captured: {percent_clustered_}\n")
        summary = {
            "type": "OPTICS",
            "model": op,
            "params": (max_eps_meters, min_samples),
            "num_clusters": num_clusters_,
            "num_noise": num_noise_,
            "percent_clustered": percent_clustered_,
        }
        # Save labels in the output df
        run_summaries.append(summary)
        output_df[colname] = labels_op
    return output_df, run_summaries


def run_hdbscans(X, df, core_threshes, min_cluster_size, cluster_selection_eps_meters=100):
    """OPTICS grid search -> df + summaries

    Labels stored in output df. Write to disk for comparison in Tableau.
    Similarly, we need to fix a small neighborhood size to limit size."""
    output_df = df.copy()
    run_summaries = []

    total_crime = X.shape[0]
    cluster_selection_eps = meters_to_hav(cluster_selection_eps_meters)

    for min_samples in core_threshes:
        print(
            f"HDBSCAN clustering with max_eps={cluster_selection_eps_meters}m, min_samples={min_samples}"
        )
        # Cluster the data and extract the labels
        colname = (
            "hdb_labs_eps="
            + str(cluster_selection_eps_meters)
            + "_ms="
            + str(min_samples)
        )
        hdb = HDBSCAN(
            cluster_selection_epsilon=cluster_selection_eps,
            min_samples=min_samples,
            min_cluster_size=min_cluster_size,
            metric="haversine",
            store_centers="centroid",
        ).fit(X)
        labels_hdb_ = hdb.labels_
        # Count clusters and noise
        num_clusters_ = len(set(labels_hdb_)) - (1 if -1 in labels_hdb_ else 0)
        num_noise_ = list(labels_hdb_).count(-1)
        percent_clustered_ = (total_crime - num_noise_) / total_crime
        print("Estimated number of clusters: %d" % num_clusters_)
        print("Estimated number of noise points: %d" % num_noise_)
        print(f"Estimated percentage of crime captured: {percent_clustered_}\n")
        summary = {
            "type": "HDBSSCAN",
            "model": hdb,
            "params": (cluster_selection_eps_meters, min_samples),
            "num_clusters": num_clusters_,
            "num_noise": num_noise_,
            "percent_clustered": percent_clustered_,
        }
        # Save labels in the output df
        run_summaries.append(summary)
        output_df[colname] = labels_hdb_
    return output_df, run_summaries

In [77]:
# Radians for haversine
X = cap_hill_23[["lat_rad", "long_rad"]]
# Taking top N ensures these blocks will be core points
core_threshes = np.flip(np.unique(top_blocks(cap_hill_23, 10)))

In [88]:
dbscans_df, db_summaries = run_dbscans(X, cap_hill_23, core_threshes, eps_meters=100)

DBSCAN clustering with eps=100m, min_samples=102
Estimated number of clusters: 4
Estimated number of noise points: 1679
Estimated percentage of crime captured: 0.5585064422824086

DBSCAN clustering with eps=100m, min_samples=95
Estimated number of clusters: 5
Estimated number of noise points: 1514
Estimated percentage of crime captured: 0.6018932421772285

DBSCAN clustering with eps=100m, min_samples=92
Estimated number of clusters: 6
Estimated number of noise points: 1346
Estimated percentage of crime captured: 0.6460688929792269

DBSCAN clustering with eps=100m, min_samples=91
Estimated number of clusters: 6
Estimated number of noise points: 1346
Estimated percentage of crime captured: 0.6460688929792269

DBSCAN clustering with eps=100m, min_samples=85
Estimated number of clusters: 6
Estimated number of noise points: 1137
Estimated percentage of crime captured: 0.7010255061793321

DBSCAN clustering with eps=100m, min_samples=77
Estimated number of clusters: 5
Estimated number of nois

In [87]:
optics_df, op_summaries = run_optics(X, cap_hill_23, core_threshes, max_eps_meters=150)

OPTICS clustering with max_eps=100m, min_samples=72
Estimated number of clusters: 6
Estimated number of noise points: 860
Estimated percentage of crime captured: 0.7738627399421509

OPTICS clustering with max_eps=100m, min_samples=65
Estimated number of clusters: 3
Estimated number of noise points: 742
Estimated percentage of crime captured: 0.804890875624507

OPTICS clustering with max_eps=100m, min_samples=62
Estimated number of clusters: 3
Estimated number of noise points: 664
Estimated percentage of crime captured: 0.8254009992111491

OPTICS clustering with max_eps=100m, min_samples=61
Estimated number of clusters: 3
Estimated number of noise points: 664
Estimated percentage of crime captured: 0.8254009992111491

OPTICS clustering with max_eps=100m, min_samples=55
Estimated number of clusters: 3
Estimated number of noise points: 599
Estimated percentage of crime captured: 0.8424927688666842

OPTICS clustering with max_eps=100m, min_samples=47
Estimated number of clusters: 4
Estimat

In [104]:
hdbscans_df, hdb_summaries = run_hdbscans(X, cap_hill_23, core_threshes, cluster_selection_eps_meters=100)

HDBSCAN clustering with max_eps=100m, min_samples=72
Estimated number of clusters: 5
Estimated number of noise points: 1526
Estimated percentage of crime captured: 0.5987378385485144

HDBSCAN clustering with max_eps=100m, min_samples=65
Estimated number of clusters: 5
Estimated number of noise points: 1099
Estimated percentage of crime captured: 0.7110176176702603

HDBSCAN clustering with max_eps=100m, min_samples=62
Estimated number of clusters: 6
Estimated number of noise points: 1037
Estimated percentage of crime captured: 0.7273205364186169

HDBSCAN clustering with max_eps=100m, min_samples=61
Estimated number of clusters: 6
Estimated number of noise points: 991
Estimated percentage of crime captured: 0.7394162503286879

HDBSCAN clustering with max_eps=100m, min_samples=55
Estimated number of clusters: 5
Estimated number of noise points: 1080
Estimated percentage of crime captured: 0.7160136734157244

HDBSCAN clustering with max_eps=100m, min_samples=47
Estimated number of clusters

### Extracting time windows
1. Need to extract endpoints of the form (start_month, start_year), (end_month, end_year)

In [119]:
cap_hill_23.columns

Index(['Offense ID', 'Offense Start DateTime', 'Offense End DateTime',
       'Group A B', 'Crime Against Category', 'Offense Parent Group',
       'Offense', 'Offense Code', 'Precinct', 'Sector', 'Beat', 'MCPP',
       '100 Block Address', 'Longitude', 'Latitude', 'Year', 'Month', 'Day',
       'Time', 'long_rad', 'lat_rad'],
      dtype='object')

Time window extraction needs debugging (endpoint issues due to rounding)

In [147]:
# Extract time window as a series of month, year intervals
def to_month_year(months, base_year):
    year = base_year + int(months/ 12)
    month = (months % 12)
    return month, year


def get_windows(start_year, end_year, length=18, step=6):
    """Generates a sequence of cuts as a 'sliding time window' 
    """
    num_months = (end_year - start_year) * 12
    starts = range(1, num_months, step)
    window_in_months = [(start, start + length - 1) for start in starts]
    windows = [(to_month_year(window[0], start_year), to_month_year(window[1], start_year)) for window in window_in_months]
    return windows

def extract_windows(df, windows):
    """Temp solution. Assumes window will not span more than 2 years."""
    extracts = []
    for window in windows:
        start, end = window
        # Get crimes in first year
        df_filter1 = df.loc[(df["Year"] == start[1]) & (df["Month"] >= start[0])]
        # Get crimes in the second year
        df_filter2 = df.loc[(df["Year"] == end[1]) & (df["Month"] <= end[0])]
        # Concat to windowed df
        df_window = pd.concat([df_filter1, df_filter2], axis=0)
        extracts.append((window, df_window))
    return extracts

# Need to join the endpoints into time ranges
windows = get_windows(2022, 2023, length=18, step=6)
extracts = extract_windows(crime_df.loc[crime_df["MCPP"] == "CAPITOL HILL"], windows)
windows

[((1, 2022), (6, 2023)), ((7, 2022), (0, 2024))]

In [148]:
# Manually extract windows for now
cap_hill = crime_df.loc[crime_df["MCPP"] == "CAPITOL HILL"]

In [149]:
filter1 = cap_hill.loc[cap_hill["Year"] == 2022]
filter2 = cap_hill.loc[(cap_hill["Year"] == 2023) & (cap_hill["Month"] <= 6)]
window1 = pd.concat([filter1, filter2], axis=0)

In [150]:
filter3 = cap_hill.loc[cap_hill["Year"] == 2022 & (cap_hill["Month"] >= 7)]
filter4 = cap_hill.loc[cap_hill["Year"] == 2023]
window2 = pd.concat([filter3, filter4], axis=0)

In [187]:
# Try some runs
windows = [window1, window2]
results = []
summaries = []
for i, window in enumerate(windows):
    # Radians for haversine
    X = window[["lat_rad", "long_rad"]]
    # Taking top N ensures these blocks will be core points
    core_threshes = np.flip(np.unique(top_blocks(window, 10)))
    # run db_scans
    output_df, db_summaries = run_dbscans(X, window, core_threshes, eps_meters=75)
    output_df, op_summaries = run_optics(X, output_df, core_threshes, max_eps_meters=75)
    output_df, hdb_summaries = run_hdbscans(X, output_df, core_threshes, core_threshes[0], cluster_selection_eps_meters=50)
    summaries.append((db_summaries, op_summaries, hdb_summaries))
    results.append(output_df)
    output_df.to_csv(f"win{i}_cap_hill_clusters.csv", index=False)

DBSCAN clustering with eps=75m, min_samples=139
Estimated number of clusters: 12
Estimated number of noise points: 3637
Estimated percentage of crime captured: 0.3880195187615682

DBSCAN clustering with eps=75m, min_samples=120
Estimated number of clusters: 14
Estimated number of noise points: 3108
Estimated percentage of crime captured: 0.47703180212014135

DBSCAN clustering with eps=75m, min_samples=94
Estimated number of clusters: 22
Estimated number of noise points: 2429
Estimated percentage of crime captured: 0.591283863368669

DBSCAN clustering with eps=75m, min_samples=83
Estimated number of clusters: 24
Estimated number of noise points: 2191
Estimated percentage of crime captured: 0.6313309776207303

DBSCAN clustering with eps=75m, min_samples=77
Estimated number of clusters: 22
Estimated number of noise points: 2109
Estimated percentage of crime captured: 0.6451287228672388

DBSCAN clustering with eps=75m, min_samples=75
Estimated number of clusters: 18
Estimated number of noi

In [183]:
windows[0].shape[0] * .005


29.715

In [184]:
df, summaries = run_dbscans(
    windows[0][["lat_rad", "long_rad"]],
    windows[0],
    core_threshes=np.flip(np.unique(top_blocks(windows[0], 10))) + 30,
    eps_meters=75,
)

DBSCAN clustering with eps=75m, min_samples=169
Estimated number of clusters: 6
Estimated number of noise points: 4827
Estimated percentage of crime captured: 0.18778394750126198

DBSCAN clustering with eps=75m, min_samples=150
Estimated number of clusters: 11
Estimated number of noise points: 3898
Estimated percentage of crime captured: 0.3441023052330473

DBSCAN clustering with eps=75m, min_samples=124
Estimated number of clusters: 15
Estimated number of noise points: 3221
Estimated percentage of crime captured: 0.4580178361097089

DBSCAN clustering with eps=75m, min_samples=113
Estimated number of clusters: 15
Estimated number of noise points: 2963
Estimated percentage of crime captured: 0.5014302540804307

DBSCAN clustering with eps=75m, min_samples=107
Estimated number of clusters: 17
Estimated number of noise points: 2877
Estimated percentage of crime captured: 0.5159010600706714

DBSCAN clustering with eps=75m, min_samples=105
Estimated number of clusters: 18
Estimated number of

In [176]:
df.to_csv("test2.csv", index=False)