In [22]:
import pandas as pd
import geopandas as gpd

In [23]:
# File paths
census_tract_shp = 'la_census_tract/LA_City_2020_Census_Tracts_.shp'
crime_csv = 'crime_data/crime_la_2020_2024.csv'

# Years to process: 2010 through 2024 inclusive.
# range() excludes its stop value, so the stop must be 2025 for 2024 to be
# included. Years with no matching crime records simply contribute no rows.
years = list(range(2010, 2025))

In [24]:
# Load the census tract polygons, reproject them to EPSG:4326, and keep only
# the geometry and the CT20 tract code column
tracts_gdf = (
    gpd.read_file(census_tract_shp)
    .to_crs(epsg=4326)[['geometry', 'CT20']]
)

# Load the crime point records (this is a very large dataset)
crime_df = pd.read_csv(crime_csv)

In [25]:
# Parse the 'Date Rptd' column into datetimes, then derive the calendar year
# of each record into a new 'Year' column
crime_df['Date Rptd'] = crime_df['Date Rptd'].pipe(pd.to_datetime)
crime_df['Year'] = crime_df['Date Rptd'].dt.year

  crime_df['Date Rptd'] = pd.to_datetime(crime_df['Date Rptd'])


In [26]:
# Count crime points per census tract and per type of crime
def aggregate_points(points_gdf, geometry_gdf):
    """Count points per census tract and crime description.

    Each point is matched to the polygon it lies within; because the join is
    an inner join, points that fall inside no polygon are dropped.

    Args:
        points_gdf: GeoDataFrame of point geometries.
        geometry_gdf: GeoDataFrame of polygon geometries.
            Between them the two frames must supply the 'CT20' and
            'Crm Cd Desc' columns used for grouping (in this notebook the
            tracts carry 'CT20' and the crime points carry 'Crm Cd Desc').

    Returns:
        DataFrame with one row per ('CT20', 'Crm Cd Desc') pair and a
        'count' column holding the number of matched points.
    """
    # Attach the containing tract's attributes to every point
    joined = gpd.sjoin(points_gdf, geometry_gdf, how='inner', predicate='within')
    # Tally the rows in each (tract, crime type) group
    tally = joined.groupby(['CT20', 'Crm Cd Desc']).size()
    return tally.reset_index(name='count')
    

In [27]:
# Collect one aggregated frame per year and concatenate them once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration, and also requires seeding with an empty frame.
yearly_results = []

# Loop through each year
for year in years:
    # Filter the crime data for the current year
    crime_df_year = crime_df[crime_df['Year'] == year]

    # No records for this year: it would contribute no rows, so skip the
    # spatial join entirely
    if crime_df_year.empty:
        continue

    # Convert the filtered crime data to a GeoDataFrame of LON/LAT points
    # tagged as EPSG:4326, the CRS the tracts were reprojected to
    crime_gdf_year = gpd.GeoDataFrame(
        crime_df_year,
        geometry=gpd.points_from_xy(crime_df_year.LON, crime_df_year.LAT),
    ).set_crs(epsg=4326)

    # Aggregate the crime points by census tract and crime type
    aggregated_points = aggregate_points(crime_gdf_year, tracts_gdf)

    # Record which year these counts belong to
    aggregated_points['Year'] = year

    yearly_results.append(aggregated_points)

if yearly_results:
    combined_results = pd.concat(yearly_results, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the expected columns so downstream cells still run
    combined_results = pd.DataFrame(columns=['CT20', 'Crm Cd Desc', 'count', 'Year'])

In [28]:
# Create a 'geoid10' column: geoid10 = '06037' + CT20.
# NOTE(review): the column is named 'geoid10' but is built from the CT20
# tract code — confirm this name is what downstream consumers expect.
combined_results['geoid10'] = '06037' + combined_results['CT20'].astype(str)
# Take an explicit copy of the column subset so the assignment below writes
# to an independent frame rather than to a selection of the previous one
combined_results = combined_results[['geoid10', 'Crm Cd Desc', 'count', 'Year']].copy()

# Replace each comma in 'Crm Cd Desc' with '|'.
# regex=False makes this a literal replacement regardless of the pandas
# version's default for the `regex` argument.
combined_results['Crm Cd Desc'] = combined_results['Crm Cd Desc'].str.replace(',', '|', regex=False)

# Save the combined results to a CSV file
combined_results.to_csv('crime_data/crime_la_2020_2024_by_tract_type.csv', index=False)

# Display a random sample of up to 5 rows. Sampling 5 rows directly avoids
# the ValueError that sample(10000) raises when there are fewer than 10,000
# rows, and avoids drawing 10,000 rows only to show 5 of them.
combined_results.sample(n=min(5, len(combined_results)))

Unnamed: 0,geoid10,Crm Cd Desc,count,Year
92844,6037212304,VEHICLE - STOLEN,12,2022
19577,6037211121,CRIMINAL THREATS - NO WEAPON DISPLAYED,7,2020
31764,6037273100,THEFT OF IDENTITY,4,2020
67912,6037273800,THEFT OF IDENTITY,6,2021
113979,6037123203,CONTEMPT OF COURT,3,2023
