In [68]:
# Import required libraries for data handling and mapping
import os
from glob import glob

import folium
import geopandas as gpd
import pandas as pd


In [69]:
# Import, append, and deduplicate all crime incident files (2019-2025)
data_folder = '../data/raw/'
csv_files = sorted(glob(os.path.join(data_folder, 'Crime_Incidents_in_20*.csv')))
print(f'Found CSV files: {csv_files}')
if not csv_files:
    raise FileNotFoundError('No crime incident CSV files found in the expected folder.')
dfs = [pd.read_csv(f) for f in csv_files]
crime_incidents = pd.concat(dfs, ignore_index=True).drop_duplicates()

Found CSV files: ['../data/raw/Crime_Incidents_in_2019.csv', '../data/raw/Crime_Incidents_in_2020.csv', '../data/raw/Crime_Incidents_in_2021.csv', '../data/raw/Crime_Incidents_in_2022.csv', '../data/raw/Crime_Incidents_in_2023.csv', '../data/raw/Crime_Incidents_in_2024.csv', '../data/raw/Crime_Incidents_in_2025.csv']


In [70]:
# Standardize column names in crime_incidents DataFrame (uppercase, underscores)
crime_incidents.columns = [col.strip().upper().replace(' ', '_') for col in crime_incidents.columns]

# Rename REPORT_DAT as REPORT_DATE
crime_incidents = crime_incidents.rename(columns={'REPORT_DAT': 'REPORT_DATE'})

# Reformat REPORT_DATE to datetime
crime_incidents['REPORT_DATE'] = pd.to_datetime(crime_incidents['REPORT_DATE'], errors='coerce')
# Also START_DATE and END_DATE
crime_incidents['START_DATE'] = pd.to_datetime(crime_incidents['START_DATE'], errors='coerce')
crime_incidents['END_DATE'] = pd.to_datetime(crime_incidents['END_DATE'], errors='coerce')

# Add a new column called YEAR drawn from REPORT_DATE without any numbers after decimal
crime_incidents['YEAR'] = crime_incidents['REPORT_DATE'].dt.year.astype(str)

# Show standardized column names
crime_incidents.columns.tolist()

['X',
 'Y',
 'CCN',
 'REPORT_DATE',
 'SHIFT',
 'METHOD',
 'OFFENSE',
 'BLOCK',
 'XBLOCK',
 'YBLOCK',
 'WARD',
 'ANC',
 'DISTRICT',
 'PSA',
 'NEIGHBORHOOD_CLUSTER',
 'BLOCK_GROUP',
 'CENSUS_TRACT',
 'VOTING_PRECINCT',
 'LATITUDE',
 'LONGITUDE',
 'BID',
 'START_DATE',
 'END_DATE',
 'OBJECTID',
 'OCTO_RECORD_ID',
 'YEAR']

In [71]:
# View the first few rows of the combined and deduplicated crime incidents dataset
crime_incidents.head()

Unnamed: 0,X,Y,CCN,REPORT_DATE,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,YEAR
0,400134.0,130213.0,19133977,2019-07-31 02:06:18+00:00,EVENING,GUN,ASSAULT W/DANGEROUS WEAPON,3301 - 3699 BLOCK OF 6TH STREET SE,400134.0,130213.0,...,9804.0,Precinct 122,38.839713,-76.998457,,2019-07-30 22:42:17+00:00,2019-07-30 22:55:26+00:00,717509953,,2019
1,397228.0,137798.0,19046953,2019-03-20 16:35:01+00:00,DAY,OTHERS,THEFT/OTHER,1300 - 1399 BLOCK OF 14TH STREET NW,397228.0,137798.0,...,5203.0,Precinct 17,38.908037,-77.03196,,2019-03-20 15:30:24+00:00,2019-03-20 15:45:03+00:00,717509954,,2019
2,397705.17,141966.76,19051853,2019-03-28 02:08:24+00:00,EVENING,OTHERS,THEFT/OTHER,4500 - 4599 BLOCK OF GEORGIA AVENUE NW,397705.17,141966.76,...,2400.0,Precinct 46,38.945592,-77.026472,,2019-03-27 18:57:00+00:00,2019-03-28 02:17:53+00:00,717509955,,2019
3,402152.0,140253.0,19206304,2019-11-18 10:39:13+00:00,MIDNIGHT,OTHERS,THEFT F/AUTO,2000 - 2199 BLOCK OF JACKSON STREET NE,402152.0,140253.0,...,9400.0,Precinct 70,38.930155,-76.975181,,2019-11-17 19:00:24+00:00,2019-11-18 07:45:26+00:00,717509956,,2019
4,396307.07,137186.46,19079495,2019-05-09 12:21:18+00:00,DAY,OTHERS,THEFT/OTHER,1800 - 1899 BLOCK OF K STREET NW,396307.07,137186.46,...,10700.0,Precinct 17,38.902525,-77.042574,GOLDEN TRIANGLE,2019-05-08 22:30:40+00:00,2019-05-09 10:45:37+00:00,717509960,,2019


In [72]:
# output as a csv
crime_incidents.to_csv(os.path.join(data_folder, 'crime_incidents.csv'), index=False)

In [73]:
# list every value variant in the offense field
offense_variants = crime_incidents['OFFENSE'].unique()
offense_variants

array(['ASSAULT W/DANGEROUS WEAPON', 'THEFT/OTHER', 'THEFT F/AUTO',
       'ROBBERY', 'MOTOR VEHICLE THEFT', 'SEX ABUSE', 'HOMICIDE',
       'BURGLARY', 'ARSON'], dtype=object)

In [74]:
# Count incidents grouped by offense and year, with years as columns
if 'YEAR' not in crime_incidents.columns or 'OFFENSE' not in crime_incidents.columns:
    raise ValueError("Missing 'YEAR' or 'OFFENSE' column in crime_incidents DataFrame.")
crime_offense_yearly = crime_incidents.groupby(['OFFENSE', 'YEAR']).size().unstack(fill_value=0)
crime_offense_yearly

YEAR,2019,2020,2021,2022,2023,2024,2025
OFFENSE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARSON,8,13,4,4,11,4,4
ASSAULT W/DANGEROUS WEAPON,1573,1632,1658,1379,1404,1025,553
BURGLARY,1274,1443,1174,1049,1092,1005,459
HOMICIDE,166,198,222,203,274,187,101
MOTOR VEHICLE THEFT,2187,3260,3479,3742,6791,5128,2979
ROBBERY,2235,1998,2038,2074,3463,2110,917
SEX ABUSE,199,178,189,182,192,142,57
THEFT F/AUTO,10741,8285,8655,7762,7750,6679,3840
THEFT/OTHER,15572,10929,10892,10765,13267,13001,7330


In [75]:
# Create a comparable 'year-to-date' pivot table for each year (2019-2025) using the min/max date in 2025 (month/day only)
max_2025_date = crime_incidents.loc[crime_incidents['YEAR'] == '2025', 'REPORT_DATE'].max()
min_2025_date = crime_incidents.loc[crime_incidents['YEAR'] == '2025', 'REPORT_DATE'].min()
if pd.isnull(max_2025_date) or pd.isnull(min_2025_date):
    raise ValueError("No valid dates found for 2025 in the dataset.")
# Get month/day for min and max dates
min_md = (min_2025_date.month, min_2025_date.day)
max_md = (max_2025_date.month, max_2025_date.day)
# Filter for incidents in each year between min_md and max_md (month/day only)
def is_within_period(row):
    if pd.isnull(row['REPORT_DATE']):
        return False
    md = (row['REPORT_DATE'].month, row['REPORT_DATE'].day)
    return (md >= min_md) and (md <= max_md)
filtered = crime_incidents[crime_incidents.apply(is_within_period, axis=1)]
# Only keep years 2019-2025
filtered = filtered[filtered['YEAR'].isin([str(y) for y in range(2019, 2026)])]
# Pivot: offenses as rows, years as columns
crime_offense_ytd = filtered.groupby(['OFFENSE', 'YEAR']).size().unstack(fill_value=0)

# Print the date range used for each year
for year in range(2019, 2026):
    print(f"Year {year}: {year}-{min_md[0]:02d}-{min_md[1]:02d} to {year}-{max_md[0]:02d}-{max_md[1]:02d}")

crime_offense_ytd.head()

Year 2019: 2019-01-01 to 2019-08-19
Year 2020: 2020-01-01 to 2020-08-19
Year 2021: 2021-01-01 to 2021-08-19
Year 2022: 2022-01-01 to 2022-08-19
Year 2023: 2023-01-01 to 2023-08-19
Year 2024: 2024-01-01 to 2024-08-19
Year 2025: 2025-01-01 to 2025-08-19


YEAR,2019,2020,2021,2022,2023,2024,2025
OFFENSE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARSON,7,11,4,3,8,3,4
ASSAULT W/DANGEROUS WEAPON,1005,1038,990,892,922,682,553
BURGLARY,815,1020,662,680,696,580,459
HOMICIDE,106,118,127,131,170,114,101
MOTOR VEHICLE THEFT,1316,1793,2097,2157,4620,3047,2979


In [76]:
# Create a comparable 'year-to-date' pivot table for each year (2019-2025) using the min/max date in 2025 (month/day only)
max_2025_date = crime_incidents.loc[crime_incidents['YEAR'] == '2025', 'REPORT_DATE'].max()
min_2025_date = crime_incidents.loc[crime_incidents['YEAR'] == '2025', 'REPORT_DATE'].min()
if pd.isnull(max_2025_date) or pd.isnull(min_2025_date):
    raise ValueError("No valid dates found for 2025 in the dataset.")
# Get month/day for min and max dates
min_md = (min_2025_date.month, min_2025_date.day)
max_md = (max_2025_date.month, max_2025_date.day)
# Filter for incidents in each year between min_md and max_md (month/day only)
def is_within_period(row):
    if pd.isnull(row['REPORT_DATE']):
        return False
    md = (row['REPORT_DATE'].month, row['REPORT_DATE'].day)
    return (md >= min_md) and (md <= max_md)
filtered = crime_incidents[crime_incidents.apply(is_within_period, axis=1)]
# Only keep years 2019-2025
filtered = filtered[filtered['YEAR'].isin([str(y) for y in range(2019, 2026)])]
# Pivot: offenses as rows, years as columns
crime_offense_ytd = filtered.groupby(['OFFENSE', 'YEAR']).size().unstack(fill_value=0)

# Print the date range used for each year
for year in range(2019, 2026):
    print(f"Year {year}: {year}-{min_md[0]:02d}-{min_md[1]:02d} to {year}-{max_md[0]:02d}-{max_md[1]:02d}")

crime_offense_ytd.head()

Year 2019: 2019-01-01 to 2019-08-19
Year 2020: 2020-01-01 to 2020-08-19
Year 2021: 2021-01-01 to 2021-08-19
Year 2022: 2022-01-01 to 2022-08-19
Year 2023: 2023-01-01 to 2023-08-19
Year 2024: 2024-01-01 to 2024-08-19
Year 2025: 2025-01-01 to 2025-08-19


YEAR,2019,2020,2021,2022,2023,2024,2025
OFFENSE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARSON,7,11,4,3,8,3,4
ASSAULT W/DANGEROUS WEAPON,1005,1038,990,892,922,682,553
BURGLARY,815,1020,662,680,696,580,459
HOMICIDE,106,118,127,131,170,114,101
MOTOR VEHICLE THEFT,1316,1793,2097,2157,4620,3047,2979


In [77]:
# Repeat the above but use August 11 as the low date
max_2025_date = crime_incidents.loc[crime_incidents['YEAR'] == '2025', 'REPORT_DATE'].max()
min_2025_date = pd.Timestamp(year=2025, month=8, day=11)
if pd.isnull(max_2025_date) or pd.isnull(min_2025_date):
    raise ValueError("No valid dates found for 2025 in the dataset.")
# Get month/day for min and max dates
min_md = (min_2025_date.month, min_2025_date.day)
max_md = (max_2025_date.month, max_2025_date.day)
# Filter for incidents in each year between min_md and max_md (month/day only)
def is_within_period(row):
    if pd.isnull(row['REPORT_DATE']):
        return False
    md = (row['REPORT_DATE'].month, row['REPORT_DATE'].day)
    return (md >= min_md) and (md <= max_md)
filtered = crime_incidents[crime_incidents.apply(is_within_period, axis=1)]
# Only keep years 2019-2025
filtered = filtered[filtered['YEAR'].isin([str(y) for y in range(2019, 2026)])]
# Pivot: offenses as rows, years as columns
crime_offense_since_aug11 = filtered.groupby(['OFFENSE', 'YEAR']).size().unstack(fill_value=0)

# Print the date range used for each year
for year in range(2019, 2026):
    print(f"Year {year}: {year}-{min_md[0]:02d}-{min_md[1]:02d} to {year}-{max_md[0]:02d}-{max_md[1]:02d}")

crime_offense_since_aug11.head()

Year 2019: 2019-08-11 to 2019-08-19
Year 2020: 2020-08-11 to 2020-08-19
Year 2021: 2021-08-11 to 2021-08-19
Year 2022: 2022-08-11 to 2022-08-19
Year 2023: 2023-08-11 to 2023-08-19
Year 2024: 2024-08-11 to 2024-08-19
Year 2025: 2025-08-11 to 2025-08-19


YEAR,2019,2020,2021,2022,2023,2024,2025
OFFENSE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARSON,0,0,0,1,0,0,0
ASSAULT W/DANGEROUS WEAPON,38,29,39,34,33,16,17
BURGLARY,36,33,36,24,29,36,21
HOMICIDE,8,3,9,2,7,2,2
MOTOR VEHICLE THEFT,64,86,75,72,179,145,77


In [78]:
# Create a comprehensive DataFrame showing crime counts by offense type, ward, and year
crime_by_ward_year_offense = crime_incidents.groupby(['WARD', 'YEAR', 'OFFENSE']).size().reset_index(name='COUNT')

# Create separate DataFrames for each offense type by ward and year
offense_types = crime_incidents['OFFENSE'].unique()
offense_dataframes = {}

print(f"Creating DataFrames for {len(offense_types)} offense types...")

for offense in offense_types:
    # Filter data for this specific offense
    offense_data = crime_incidents[crime_incidents['OFFENSE'] == offense]
    
    # Create pivot table: wards as rows, years as columns
    offense_pivot = offense_data.groupby(['WARD', 'YEAR']).size().unstack(fill_value=0)
    
    # Convert ward numbers to integers for cleaner display (removes decimal points)
    offense_pivot.index = offense_pivot.index.astype(int)
    
    # Store in dictionary
    offense_dataframes[offense] = offense_pivot
    
    # Save each offense type to a separate CSV
    safe_filename = offense.lower().replace('/', '_').replace(' ', '_').replace('w/', 'with_')
    filename = f"crime_{safe_filename}_by_ward_year.csv"
    filepath = os.path.join(data_folder, '../processed/', filename)
    offense_pivot.to_csv(filepath)
    print(f"Saved {offense} data to: {filename}")

# Save the comprehensive dataset
comprehensive_file = os.path.join(data_folder, '../processed/crime_by_ward_year_offense_comprehensive.csv')
crime_by_ward_year_offense.to_csv(comprehensive_file, index=False)

print(f"\nFiles created:")
print(f"- Comprehensive data: crime_by_ward_year_offense_comprehensive.csv")
print(f"- Individual offense files: {len(offense_types)} CSV files")
print(f"\nTotal offense types: {len(offense_types)}")
print(f"Available offense types: {sorted(offense_types)}")

# Display sample of comprehensive data
print(f"\nComprehensive DataFrame shape: {crime_by_ward_year_offense.shape}")
crime_by_ward_year_offense.head(15)

Creating DataFrames for 9 offense types...
Saved ASSAULT W/DANGEROUS WEAPON data to: crime_assault_w_dangerous_weapon_by_ward_year.csv
Saved THEFT/OTHER data to: crime_theft_other_by_ward_year.csv
Saved THEFT F/AUTO data to: crime_theft_f_auto_by_ward_year.csv
Saved ROBBERY data to: crime_robbery_by_ward_year.csv
Saved MOTOR VEHICLE THEFT data to: crime_motor_vehicle_theft_by_ward_year.csv
Saved SEX ABUSE data to: crime_sex_abuse_by_ward_year.csv
Saved HOMICIDE data to: crime_homicide_by_ward_year.csv
Saved BURGLARY data to: crime_burglary_by_ward_year.csv
Saved ARSON data to: crime_arson_by_ward_year.csv

Files created:
- Comprehensive data: crime_by_ward_year_offense_comprehensive.csv
- Individual offense files: 9 CSV files

Total offense types: 9
Available offense types: ['ARSON', 'ASSAULT W/DANGEROUS WEAPON', 'BURGLARY', 'HOMICIDE', 'MOTOR VEHICLE THEFT', 'ROBBERY', 'SEX ABUSE', 'THEFT F/AUTO', 'THEFT/OTHER']

Comprehensive DataFrame shape: (477, 4)


Unnamed: 0,WARD,YEAR,OFFENSE,COUNT
0,1.0,2019,ARSON,1
1,1.0,2019,ASSAULT W/DANGEROUS WEAPON,163
2,1.0,2019,BURGLARY,135
3,1.0,2019,HOMICIDE,16
4,1.0,2019,MOTOR VEHICLE THEFT,248
5,1.0,2019,ROBBERY,407
6,1.0,2019,SEX ABUSE,17
7,1.0,2019,THEFT F/AUTO,1690
8,1.0,2019,THEFT/OTHER,2148
9,1.0,2020,ARSON,1


In [79]:
# Create detailed homicides analysis by ward and year
homicides = crime_incidents[crime_incidents['OFFENSE'] == 'HOMICIDE']

# Create pivot table: wards as rows, years as columns
homicides_by_ward_year = homicides.groupby(['WARD', 'YEAR']).size().unstack(fill_value=0)

# Convert ward numbers to integers for cleaner display (removes decimal points)
homicides_by_ward_year.index = homicides_by_ward_year.index.astype(int)

# Add a total column (sum across all years for each ward)
homicides_by_ward_year['TOTAL_ALL_YEARS'] = homicides_by_ward_year.sum(axis=1)

# Add a total row (sum across all wards for each year)
totals_row = homicides_by_ward_year.sum(axis=0)
totals_row.name = 'TOTAL_ALL_WARDS'

# Combine the data with totals
homicides_with_totals = pd.concat([homicides_by_ward_year, totals_row.to_frame().T])

# Save to CSV
homicides_file = os.path.join(data_folder, '../processed/homicides_by_ward_year_with_totals.csv')
homicides_with_totals.to_csv(homicides_file)

print(f"Homicides data saved to: homicides_by_ward_year_with_totals.csv")
print(f"Data includes:")
print(f"- Rows: {len(homicides_by_ward_year)} wards + 1 total row")
print(f"- Columns: Years 2019-2025 + Total column")
print(f"- Total homicides across all years and wards: {homicides_with_totals.loc['TOTAL_ALL_WARDS', 'TOTAL_ALL_YEARS']}")

# Display the data
print(f"\nHomicides by Ward and Year:")
homicides_with_totals

Homicides data saved to: homicides_by_ward_year_with_totals.csv
Data includes:
- Rows: 8 wards + 1 total row
- Columns: Years 2019-2025 + Total column
- Total homicides across all years and wards: 1351

Homicides by Ward and Year:


YEAR,2019,2020,2021,2022,2023,2024,2025,TOTAL_ALL_YEARS
1,16,14,13,14,28,15,7,107
2,0,9,7,11,11,11,4,53
3,3,3,2,2,2,2,2,16
4,6,9,20,9,16,15,3,78
5,18,25,25,33,36,23,12,172
6,12,19,11,9,26,11,12,100
7,46,57,62,47,56,45,23,336
8,65,62,82,78,99,65,38,489
TOTAL_ALL_WARDS,166,198,222,203,274,187,101,1351


In [80]:
# Create separate CSV files for specific crime types with all incident details

# Define crime filters
crime_filters = {
    'all_homicides': crime_incidents[crime_incidents['OFFENSE'] == 'HOMICIDE'],
    'all_assaults': crime_incidents[crime_incidents['OFFENSE'] == 'ASSAULT W/DANGEROUS WEAPON'],
    'all_burglaries': crime_incidents[crime_incidents['OFFENSE'] == 'BURGLARY'],
    'all_robberies': crime_incidents[crime_incidents['OFFENSE'] == 'ROBBERY'],
    'all_car_thefts': crime_incidents[crime_incidents['OFFENSE'] == 'MOTOR VEHICLE THEFT'],
    'all_gun_crimes': crime_incidents[crime_incidents['METHOD'].str.contains('GUN', case=False, na=False)]
}

print("Creating detailed CSV files for specific crime types...")
print("=" * 60)

# Create and save each CSV file
for filename, filtered_data in crime_filters.items():
    # Create file path
    filepath = os.path.join(data_folder, '../processed/', f'{filename}.csv')
    
    # Save the filtered data with all columns
    filtered_data.to_csv(filepath, index=False)
    
    # Print summary information
    print(f"\n{filename.replace('_', ' ').title()}:")
    print(f"  - File: {filename}.csv")
    print(f"  - Records: {len(filtered_data):,}")
    print(f"  - Columns: {len(filtered_data.columns)}")
    
    # Show date range if data exists
    if len(filtered_data) > 0:
        date_range = filtered_data['REPORT_DATE'].dt.date
        min_date = date_range.min()
        max_date = date_range.max()
        print(f"  - Date range: {min_date} to {max_date}")
        
        # Show coordinate availability
        coords_available = len(filtered_data.dropna(subset=['LATITUDE', 'LONGITUDE']))
        print(f"  - Records with coordinates: {coords_available:,} ({coords_available/len(filtered_data)*100:.1f}%)")
        
        # Show ward distribution
        ward_counts = filtered_data['WARD'].value_counts().sort_index()
        print(f"  - Ward distribution: {dict(ward_counts)}")

print(f"\n" + "=" * 60)
print("All crime-specific CSV files created successfully!")
print(f"Files saved to: {os.path.join(data_folder, '../processed/')}")

# Summary of all files
total_records = sum(len(data) for data in crime_filters.values())
print(f"\nTotal records across all crime-specific files: {total_records:,}")
print(f"Note: Gun crimes count may overlap with other categories as it's filtered by method, not offense.")

Creating detailed CSV files for specific crime types...

All Homicides:
  - File: all_homicides.csv
  - Records: 1,351
  - Columns: 26
  - Date range: 2019-01-01 to 2025-08-13
  - Records with coordinates: 1,351 (100.0%)
  - Ward distribution: {1.0: np.int64(107), 2.0: np.int64(53), 3.0: np.int64(16), 4.0: np.int64(78), 5.0: np.int64(172), 6.0: np.int64(100), 7.0: np.int64(336), 8.0: np.int64(489)}

All Homicides:
  - File: all_homicides.csv
  - Records: 1,351
  - Columns: 26
  - Date range: 2019-01-01 to 2025-08-13
  - Records with coordinates: 1,351 (100.0%)
  - Ward distribution: {1.0: np.int64(107), 2.0: np.int64(53), 3.0: np.int64(16), 4.0: np.int64(78), 5.0: np.int64(172), 6.0: np.int64(100), 7.0: np.int64(336), 8.0: np.int64(489)}

All Assaults:
  - File: all_assaults.csv
  - Records: 9,224
  - Columns: 26
  - Date range: 2019-01-01 to 2025-08-19
  - Records with coordinates: 9,224 (100.0%)
  - Ward distribution: {1.0: np.int64(846), 2.0: np.int64(663), 3.0: np.int64(157), 4.0: 

In [None]:
import pandas as pd
import folium
import numpy as np

# Load the crime data
# df = pd.read_csv('../data/raw/Crime_Incidents_in_2025.csv')

df = crime_incidents[crime_incidents['YEAR'].isin(['2024', '2025'])]

# Clean the data and remove rows with missing coordinates
df = df.dropna(subset=['LATITUDE', 'LONGITUDE'])

# Define the map bounds for consistent scale across all maps
lat_min, lat_max = df['LATITUDE'].min(), df['LATITUDE'].max()
lon_min, lon_max = df['LONGITUDE'].min(), df['LONGITUDE'].max()
center_lat = df['LATITUDE'].mean()
center_lon = df['LONGITUDE'].mean()

# Jitter function to avoid overlapping points
def add_jitter(coordinates, jitter_amount=0.0001):
    """Add small random jitter to coordinates to separate overlapping points"""
    jitter_lat = np.random.normal(0, jitter_amount, len(coordinates))
    jitter_lon = np.random.normal(0, jitter_amount, len(coordinates))
    return coordinates + np.column_stack([jitter_lat, jitter_lon])

def create_crime_map(crime_data, title, filename, color='red'):
    """Create a folium map for specific crime type"""
    
    # Create base map with CartoDB Positron tiles
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='CartoDB positron'
    )
    
    # Set consistent bounds for all maps
    m.fit_bounds([[lat_min, lon_min], [lat_max, lon_max]])
    
    # Add jitter to coordinates
    coords = crime_data[['LATITUDE', 'LONGITUDE']].values
    if len(coords) > 0:
        jittered_coords = add_jitter(coords)

        # Add points with 60% opacity, no popups
        for i, (lat, lon) in enumerate(jittered_coords):
            folium.CircleMarker(
                location=[lat, lon],
                radius=2,  # Small radius for point dots
                color=color,
                weight=1,
                fillColor=color,
                fillOpacity=0.6,  # 60% opacity as recommended
                opacity=0.8
            ).add_to(m)
    
    # Add title with comma formatting for thousands
    incident_count = f"{len(crime_data):,}"
    title_html = f'''
                 <h3 align="center" style="font-size:18px"><b>{title}</b></h3>
                 <p align="center" style="font-size:12px">Total incidents: {incident_count}</p>
                 '''
    m.get_root().html.add_child(folium.Element(title_html))
    
    # Save map to docs folder
    m.save(f"../docs/{filename}")
    return m

# 1. Homicide map
homicide_data = df[df['OFFENSE'] == 'HOMICIDE']
homicide_map = create_crime_map(homicide_data, 'Homicides', 'homicide_map.html', 'darkred')

# 2. Assault with deadly weapon map  
assault_data = df[df['OFFENSE'] == 'ASSAULT W/DANGEROUS WEAPON']
assault_map = create_crime_map(assault_data, 'Assault with Dangerous Weapon', 'assault_map.html', 'red')

# 3. All gun crimes map (using METHOD field)
gun_crimes = df[df['METHOD'] == 'GUN']
gun_map = create_crime_map(gun_crimes, 'Gun Crimes', 'gun_crimes_map.html', 'orange')

# 4. Robbery map
robbery_data = df[df['OFFENSE'] == 'ROBBERY']
robbery_map = create_crime_map(robbery_data, 'Robberies', 'robbery_map.html', 'purple')

# 5. Burglary map
burglary_data = df[df['OFFENSE'] == 'BURGLARY']
burglary_map = create_crime_map(burglary_data, 'Burglaries', 'burglary_map.html', 'blue')

# 6. Car theft map (Motor Vehicle Theft)
car_theft_data = df[df['OFFENSE'] == 'MOTOR VEHICLE THEFT']
car_theft_map = create_crime_map(car_theft_data, 'Motor Vehicle Theft', 'car_theft_map.html', 'green')

# Print summary statistics
print("Crime Type Summary:")
print(f"Homicides: {len(homicide_data):,} incidents")
print(f"Assault with Dangerous Weapon: {len(assault_data):,} incidents") 
print(f"Gun Crimes: {len(gun_crimes):,} incidents")
print(f"Robberies: {len(robbery_data):,} incidents")
print(f"Burglaries: {len(burglary_data):,} incidents")
print(f"Motor Vehicle Theft: {len(car_theft_data):,} incidents")
print(f"\nMaps saved as HTML files in docs folder")

TypeError: Object of type function is not JSON serializable

In [None]:
# Create a combined map with an additional GeoJSON layer for federal troops and law enforcement locations

def create_combined_map_with_geojson():
    """Create a map showing all crime types with different colors and a GeoJSON layer for federal troops/law enforcement locations"""
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='CartoDB positron'
    )
    m.fit_bounds([[lat_min, lon_min], [lat_max, lon_max]])

    # Add crime type points (same as before)
    crime_types = [
        (homicide_data, 'Homicide', 'darkred'),
        (assault_data, 'Assault w/Dangerous Weapon', 'red'),
        (gun_crimes, 'Gun Crimes', 'orange'),
        (robbery_data, 'Robbery', 'purple'),
        (burglary_data, 'Burglary', 'blue'),
        (car_theft_data, 'Motor Vehicle Theft', 'green')
    ]
    for crime_data, crime_name, color in crime_types:
        if len(crime_data) > 0:
            coords = crime_data[['LATITUDE', 'LONGITUDE']].values
            jittered_coords = add_jitter(coords)
            for i, (lat, lon) in enumerate(jittered_coords):
                folium.CircleMarker(
                    location=[lat, lon],
                    radius=2,
                    color=color,
                    weight=1,
                    fillColor=color,
                    fillOpacity=0.7,
                    opacity=0.7
                ).add_to(m)

    # Add legend
    legend_html = '''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; width: 200px; height: 150px; 
                background-color: white; border:2px solid grey; z-index:9999; 
                font-size:18px; padding: 10px
                ">
    <p><b>Crime Types</b></p>
    <p><i class="fa fa-circle" style="color:darkred"></i> Homicide</p>
    <p><i class="fa fa-circle" style="color:red"></i> Assault w/Weapon</p>
    <p><i class="fa fa-circle" style="color:orange"></i> Gun Crimes</p>
    <p><i class="fa fa-circle" style="color:purple"></i> Robbery</p>
    <p><i class="fa fa-circle" style="color:blue"></i> Burglary</p>
    <p><i class="fa fa-circle" style="color:green"></i> Vehicle Theft</p>
    <p><i class="fa fa-circle" style="color:black"></i> Federal Troops/Law Enforcement</p>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    # Add GeoJSON layer
    geojson_path = '../data/raw/cbs-verified-locations-of-federal-troops-and-law-enforcement-in-dc.geojson'
    folium.GeoJson(
        geojson_path,
        name='Federal Troops/Law Enforcement',
        style_function=lambda x: {'color': 'black', 'weight': 3, 'fillOpacity': 0.5}
    ).add_to(m)

    folium.LayerControl().add_to(m)
    m.save('../docs/all_crimes_combined_map_with_geojson.html')
    return m

# Create combined map with GeoJSON layer
combined_map_with_geojson = create_combined_map_with_geojson()
print("Combined map with GeoJSON saved as '../docs/all_crimes_combined_map_with_geojson.html'")


Combined map with GeoJSON saved as '../docs/all_crimes_combined_map_with_geojson.html'
