# Crime rate per SAL region

This notebook will aim to first join the crime data with SAL data.

Then determine crime rate (number of crime instance per population) within each SAL/suburb region 

In [1]:
import pandas as pd
import geopandas as gpd

In [2]:
# Read crime count data 
crime_count_gpd = gpd.read_file('../data/raw/crime_count_with_point_geo.geojson')  # Specify the driver based on the desired output format
# Read SA2 gdf
SAL_gdf = gpd.read_file("../data/raw/victoria_region_gdf/SAL_region_gdf.geojson")
# remove null
SAL_gdf = SAL_gdf.dropna()

### Join the crime data and SAL by geometry join

In [3]:
# Use sjoin to find out which suburb each point belongs to
crime_count_gpd = gpd.sjoin(crime_count_gpd, SAL_gdf, how="left", op="within")

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:7844

  crime_count_gpd = gpd.sjoin(crime_count_gpd, SAL_gdf, how="left", op="within")


In [4]:
crime_count_gpd

Unnamed: 0,Suburb/Town Name,Postcode,Year,Incidents Recorded,geometry,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area
0,abbeyard,3737,2015,4,POINT (146.76407 -37.02132),0.0,Abbeyard,20001,0.033162
1,abbeyard,3737,2017,1,POINT (146.76407 -37.02132),0.0,Abbeyard,20001,0.033162
2,abbeyard,3737,2019,3,POINT (146.76407 -37.02132),0.0,Abbeyard,20001,0.033162
3,abbeyard,3737,2021,1,POINT (146.76407 -37.02132),0.0,Abbeyard,20001,0.033162
4,abbotsford,3067,2014,793,POINT (144.99836 -37.80236),1.0,Abbotsford,20002,0.000178
...,...,...,...,...,...,...,...,...,...
23023,zeerust,3634,2022,2,POINT (145.39733 -36.27020),2942.0,Zeerust,22943,0.001808
23024,zeerust,3634,2023,8,POINT (145.39733 -36.27020),2942.0,Zeerust,22943,0.001808
23025,zumsteins,3401,2018,2,POINT (142.38424 -37.11879),2943.0,Zumsteins,22944,0.026011
23026,zumsteins,3401,2019,2,POINT (142.38424 -37.11879),2943.0,Zumsteins,22944,0.026011


### Feature selection

In [5]:
crime_count_gpd = crime_count_gpd[['SAL_NAME21', 'SAL_CODE21', 'Year', 'Incidents Recorded']]

### Rename the incident recorded to number of crime incident
crime_count_gpd = crime_count_gpd.rename(columns={'Incidents Recorded': 'Crime_incidents_count'})

crime_count_gpd

Unnamed: 0,SAL_NAME21,SAL_CODE21,Year,Crime_incidents_count
0,Abbeyard,20001,2015,4
1,Abbeyard,20001,2017,1
2,Abbeyard,20001,2019,3
3,Abbeyard,20001,2021,1
4,Abbotsford,20002,2014,793
...,...,...,...,...
23023,Zeerust,22943,2022,2
23024,Zeerust,22943,2023,8
23025,Zumsteins,22944,2018,2
23026,Zumsteins,22944,2019,2


#### Check if one location/suburb have been mapped to same SAL region 

In [6]:
## Check if one location/suburb have been mapped to same SAL region 
## We temp as a temporary df
temp = crime_count_gpd[['SAL_NAME21', 'SAL_CODE21', 'Year']]

duplicates = temp[temp.duplicated()]
duplicates

Unnamed: 0,SAL_NAME21,SAL_CODE21,Year
967,Balnarring,20120,2014
968,Balnarring,20120,2015
969,Balnarring,20120,2016
970,Balnarring,20120,2017
971,Balnarring,20120,2018
...,...,...,...
22914,Alvie,20038,2017
22915,Alvie,20038,2018
22916,Alvie,20038,2019
22917,Alvie,20038,2020


We aware this duplication could due to the geographical scale and region for SAL, which could result single SAL region contain one or more locality in the crime data, hence we will group by SAL_NAME21	SAL_CODE21	Year and sum up the Crime_incidents_count

In [7]:
crime_count_gpd

Unnamed: 0,SAL_NAME21,SAL_CODE21,Year,Crime_incidents_count
0,Abbeyard,20001,2015,4
1,Abbeyard,20001,2017,1
2,Abbeyard,20001,2019,3
3,Abbeyard,20001,2021,1
4,Abbotsford,20002,2014,793
...,...,...,...,...
23023,Zeerust,22943,2022,2
23024,Zeerust,22943,2023,8
23025,Zumsteins,22944,2018,2
23026,Zumsteins,22944,2019,2


In [8]:
# Group by 'SAL_NAME21', 'SAL_CODE21', and 'Year' and sum 'Crime_incidents_count'
crime_count_gpd = crime_count_gpd.groupby(['SAL_NAME21', 'SAL_CODE21', 'Year'])['Crime_incidents_count'].sum().reset_index()

# Rename the 'Crime_incidents_count' column to 'total_crime_incidents'
crime_count_gpd = crime_count_gpd.rename(columns={'Crime_incidents_count': 'total_crime_incidents'})


In [9]:
# save
crime_count_gpd.to_csv('../data/curated/crime_count_SAL.csv',index=False)

crime_count_gpd

Unnamed: 0,SAL_NAME21,SAL_CODE21,Year,total_crime_incidents
0,Abbeyard,20001,2015,4
1,Abbeyard,20001,2017,1
2,Abbeyard,20001,2019,3
3,Abbeyard,20001,2021,1
4,Abbotsford,20002,2014,793
...,...,...,...,...
22729,Zeerust,22943,2022,2
22730,Zeerust,22943,2023,8
22731,Zumsteins,22944,2018,2
22732,Zumsteins,22944,2019,2


# Select 2023 year

In [10]:
# Select the releveant year 2023
crime_2023 = crime_count_gpd[crime_count_gpd['Year']==2023]


In [11]:
crime_2023['SAL_CODE21'] = crime_2023['SAL_CODE21'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_2023['SAL_CODE21'] = crime_2023['SAL_CODE21'].astype(int)


In [12]:
crime_2023

Unnamed: 0,SAL_NAME21,SAL_CODE21,Year,total_crime_incidents
13,Abbotsford,20002,2023,1097
23,Aberfeldie,20003,2023,79
35,Acheron,20005,2023,1
47,Adams Estate,20007,2023,10
51,Addington,20008,2023,1
...,...,...,...,...
22692,Youarang,22938,2023,1
22701,Yulecart,22939,2023,5
22716,Yuroke,22941,2023,9
22720,Yuulong,22942,2023,2


# Import population data

In [13]:
domain_df = pd.read_csv('../data/curated/final_drop_pcg.csv')
# feature selection
domain_df = domain_df[['SAL_NAME21', 'SAL_CODE', 'Total Suburb Population']]
# find population per sal region
population_sal = domain_df.groupby(['SAL_NAME21', 'SAL_CODE'])['Total Suburb Population'].mean().reset_index()


In [14]:
# convert the value type to int
population_sal['SAL_CODE'] = population_sal['SAL_CODE'].astype(int)


In [15]:
population_sal

Unnamed: 0,SAL_NAME21,SAL_CODE,Total Suburb Population
0,Abbotsford (Vic.),20002,4817.000000
1,Aberfeldie,20003,6463.000000
2,Aintree,20011,5300.000000
3,Aireys Inlet,20013,3100.000000
4,Airport West,20015,3672.000000
...,...,...,...
748,Yarrambat,22916,3291.000000
749,Yarraville,22917,6355.777778
750,Yarrawonga (Vic.),22919,3847.000000
751,Yea,22924,1965.000000


## merge population and crime data for crime rate

Ensure use left join here, as we also want to consider SAL regions that we have population data but not crime data. We will estimate them by victoria's overall crime rate by its population 

In [16]:
merged = population_sal.merge(crime_2023, left_on='SAL_CODE', right_on='SAL_CODE21', how='left')


In [17]:
merged

Unnamed: 0,SAL_NAME21_x,SAL_CODE,Total Suburb Population,SAL_NAME21_y,SAL_CODE21,Year,total_crime_incidents
0,Abbotsford (Vic.),20002,4817.000000,Abbotsford,20002.0,2023.0,1097.0
1,Aberfeldie,20003,6463.000000,Aberfeldie,20003.0,2023.0,79.0
2,Aintree,20011,5300.000000,Aintree,20011.0,2023.0,675.0
3,Aireys Inlet,20013,3100.000000,Aireys Inlet,20013.0,2023.0,26.0
4,Airport West,20015,3672.000000,Airport West,20015.0,2023.0,608.0
...,...,...,...,...,...,...,...
748,Yarrambat,22916,3291.000000,Yarrambat,22916.0,2023.0,18.0
749,Yarraville,22917,6355.777778,Yarraville,22917.0,2023.0,792.0
750,Yarrawonga (Vic.),22919,3847.000000,Yarrawonga,22919.0,2023.0,350.0
751,Yea,22924,1965.000000,Yea,22924.0,2023.0,83.0


In [18]:
# select desire column/features 
merged = merged[['SAL_NAME21_x', 'SAL_CODE', 'Total Suburb Population', 'total_crime_incidents']]


In [19]:
merged 

Unnamed: 0,SAL_NAME21_x,SAL_CODE,Total Suburb Population,total_crime_incidents
0,Abbotsford (Vic.),20002,4817.000000,1097.0
1,Aberfeldie,20003,6463.000000,79.0
2,Aintree,20011,5300.000000,675.0
3,Aireys Inlet,20013,3100.000000,26.0
4,Airport West,20015,3672.000000,608.0
...,...,...,...,...
748,Yarrambat,22916,3291.000000,18.0
749,Yarraville,22917,6355.777778,792.0
750,Yarrawonga (Vic.),22919,3847.000000,350.0
751,Yea,22924,1965.000000,83.0


In [20]:
# check suburb with missing crime data 
merged[merged.isnull().any(axis=1)]

Unnamed: 0,SAL_NAME21_x,SAL_CODE,Total Suburb Population,total_crime_incidents
37,Balnarring Beach,20121,10027.0,
108,Bundalong South,20396,1208.0,
200,Deanside,20724,4244.0,
255,Falls Creek (Vic.),20906,4291.0,
275,Fraser Rise,20950,3364.0,
301,Greta South,21109,4196.0,
332,Hillside (East Gippsland - Vic.),21192,3585.0,
411,Lucas,21550,6287.0,
479,Mount Eccles,21791,4955.0,
513,Noble Park North,21953,2909.0,


## Estimate missing data by victoria's crime rate multiply with the population

According to research currently victoria population is 6.78million 
the total record crime instance is 371,691 from https://www.crimestatistics.vic.gov.au/media-centre/news/key-figures-year-ending-june-2023

crime rate = crime instance/population


we will estimate any missing value of crime instance by vic crime rate *population

In [21]:
# we will estimate any missing value of crime instance by vic crime rate *population 
vic_population = 6780000
crime_rate = 371691/vic_population
crime_rate

0.054821681415929205

In [22]:
# Estimate total_crime_incidents for rows with missing values
merged['total_crime_incidents'].fillna(merged['Total Suburb Population'] * crime_rate, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['total_crime_incidents'].fillna(merged['Total Suburb Population'] * crime_rate, inplace=True)


In [23]:
merged

Unnamed: 0,SAL_NAME21_x,SAL_CODE,Total Suburb Population,total_crime_incidents
0,Abbotsford (Vic.),20002,4817.000000,1097.0
1,Aberfeldie,20003,6463.000000,79.0
2,Aintree,20011,5300.000000,675.0
3,Aireys Inlet,20013,3100.000000,26.0
4,Airport West,20015,3672.000000,608.0
...,...,...,...,...
748,Yarrambat,22916,3291.000000,18.0
749,Yarraville,22917,6355.777778,792.0
750,Yarrawonga (Vic.),22919,3847.000000,350.0
751,Yea,22924,1965.000000,83.0


In [24]:
# check if there is any null
nan_row_count = merged.isna().any(axis=1).sum()

print("Number of rows with NaN values:", nan_row_count)

Number of rows with NaN values: 0


# Find crime rate for each suburb  

In [25]:
# create a new column for crime rate
merged['crime_rate'] = merged['total_crime_incidents'] / merged['Total Suburb Population']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['crime_rate'] = merged['total_crime_incidents'] / merged['Total Suburb Population']


In [26]:
merged

Unnamed: 0,SAL_NAME21_x,SAL_CODE,Total Suburb Population,total_crime_incidents,crime_rate
0,Abbotsford (Vic.),20002,4817.000000,1097.0,0.227735
1,Aberfeldie,20003,6463.000000,79.0,0.012223
2,Aintree,20011,5300.000000,675.0,0.127358
3,Aireys Inlet,20013,3100.000000,26.0,0.008387
4,Airport West,20015,3672.000000,608.0,0.165577
...,...,...,...,...,...
748,Yarrambat,22916,3291.000000,18.0,0.005469
749,Yarraville,22917,6355.777778,792.0,0.124611
750,Yarrawonga (Vic.),22919,3847.000000,350.0,0.090980
751,Yea,22924,1965.000000,83.0,0.042239


In [27]:
# rename the column
merged.rename(columns={'SAL_NAME21_x': 'SAL_NAME21'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged.rename(columns={'SAL_NAME21_x': 'SAL_NAME21'}, inplace=True)


In [28]:
merged

Unnamed: 0,SAL_NAME21,SAL_CODE,Total Suburb Population,total_crime_incidents,crime_rate
0,Abbotsford (Vic.),20002,4817.000000,1097.0,0.227735
1,Aberfeldie,20003,6463.000000,79.0,0.012223
2,Aintree,20011,5300.000000,675.0,0.127358
3,Aireys Inlet,20013,3100.000000,26.0,0.008387
4,Airport West,20015,3672.000000,608.0,0.165577
...,...,...,...,...,...
748,Yarrambat,22916,3291.000000,18.0,0.005469
749,Yarraville,22917,6355.777778,792.0,0.124611
750,Yarrawonga (Vic.),22919,3847.000000,350.0,0.090980
751,Yea,22924,1965.000000,83.0,0.042239


However, here we only have 753 data as we only have 753 data of SAL with population. 

Here we will estimate the remainning suburb/SAL region's crime rate by victoria's crime rate in 2023 we obtain previosly 0.054821681415929205

In [29]:
Full_SAL = SAL_gdf[['SAL_NAME21', 'SAL_CODE21']]
# convert the value type to int
Full_SAL['SAL_CODE21'] = Full_SAL['SAL_CODE21'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Full_SAL['SAL_CODE21'] = Full_SAL['SAL_CODE21'].astype(int)


In [30]:
# Join the crime data with all SAL region
merged_full = Full_SAL.merge(merged, left_on='SAL_CODE21', right_on='SAL_CODE', how='left')
merged_full = merged_full[['SAL_NAME21_x', 'SAL_CODE21', 'crime_rate']]

In [31]:
merged_full

Unnamed: 0,SAL_NAME21_x,SAL_CODE21,crime_rate
0,Abbeyard,20001,
1,Abbotsford,20002,0.227735
2,Aberfeldie,20003,0.012223
3,Aberfeldy,20004,
4,Acheron,20005,
...,...,...,...
2939,Yundool,22940,
2940,Yuroke,22941,
2941,Yuulong,22942,
2942,Zeerust,22943,


In [32]:
# fill null by estimate opf victoria crime rate 
merged_full['crime_rate'].fillna(crime_rate, inplace=True)

In [33]:
merged_full

Unnamed: 0,SAL_NAME21_x,SAL_CODE21,crime_rate
0,Abbeyard,20001,0.054822
1,Abbotsford,20002,0.227735
2,Aberfeldie,20003,0.012223
3,Aberfeldy,20004,0.054822
4,Acheron,20005,0.054822
...,...,...,...
2939,Yundool,22940,0.054822
2940,Yuroke,22941,0.054822
2941,Yuulong,22942,0.054822
2942,Zeerust,22943,0.054822


In [35]:
# Save to Cureated 
merged_full.to_csv("../data/curated/crime_rate.csv", index=False)
