In [1]:
# load dependencies
import pandas as pd

In [6]:
# Load the source crime CSV into a DataFrame. raw_data is kept untouched so
# later cells can rebuild their results from it.
raw_data = pd.read_csv(filepath_or_buffer='2022-23_data_sa_crime.csv')

In [48]:
# Build the cleaned frame from raw_data without modifying raw_data itself:
#   * discard the level 3 offence description, which is not needed
#   * parse 'Reported Date' (day/month/year text) into pandas datetimes, which
#     display and export in ISO form (YYYY-MM-DD)
df = (
    raw_data
    .drop(columns=['Offence Level 3 Description'])
    .assign(**{
        'Reported Date': lambda frame: pd.to_datetime(frame['Reported Date'], format='%d/%m/%Y')
    })
)

# Relabel the remaining columns by position: the names below are applied in
# order, so the list must match the column order left after the drop above.
df.columns = ['Date', 'Suburb', 'Postcode', 'Offence Level 1 Description', 'Offence Level 2 Description', 'Offence Count']

In [50]:
# Build a postcode -> distance-from-CBD lookup from radius50kmcbd.csv.
# The file has no header row and column 0 of each row is handled as text of
# the form "postcode,suburb,distance from cbd". That text is split on ","
# into three labelled columns, the suburb name ('Location') is discarded, and
# the distances are cast to float and averaged so each postcode ends up with
# exactly one row.
# NOTE(review): every distance in a group, zeros included, contributes to the
# mean; only missing values are skipped — confirm zeros should be included.
# NOTE(review): 'Postcode' comes out of the string split as text, so it
# presumably has to match the dtype of the crime data's postcode column for
# the later isin()/merge to find matches — verify.
radius50km = (
    pd.read_csv('radius50kmcbd.csv', header=None)[0]
    .str.split(',', expand=True)
    .set_axis(['Postcode', 'Location', 'Distance from CBD (km)'], axis=1)
    .drop(columns=['Location'])
    .groupby(['Postcode'], as_index=False)
    .agg({'Distance from CBD (km)': lambda distances: distances.astype(float).mean(skipna=True)})
)

In [52]:
# Keep only offences whose postcode appears in radius50km and attach that
# postcode's 'Distance from CBD (km)' to each of them.
#
# A single inner join does both jobs: rows of df without a matching postcode
# are dropped by the join itself, so no separate isin() pre-filter is needed.
# The result keeps df's row order and gets a fresh 0..n-1 index.
#
# validate='many_to_one' makes pandas raise MergeError if radius50km ever
# holds the same postcode more than once. Without the check such a duplicate
# would silently repeat offence rows and inflate every 'Offence Count' total
# computed from this frame.
df_50kmradius = df.merge(
    radius50km,
    on='Postcode',
    how='inner',
    validate='many_to_one',
)

In [57]:
# Offence totals at level 2 detail.
# Produces one row per (Date, Suburb, Offence Level 2 Description):
# 'Offence Count' is summed across the group, and every other column takes
# the value from the first row of the group.
# NOTE(review): 'Postcode' and 'Distance from CBD (km)' come from the first
# row only; this assumes a suburb name maps to a single postcode — TODO confirm.
# NOTE(review): groupby's default dropna=True leaves out rows whose Date,
# Suburb or description is missing — verify none are expected here.
df_level2 = df_50kmradius.groupby(
    by=['Date', 'Suburb', 'Offence Level 2 Description'],
    as_index=False,
).aggregate({
    **dict.fromkeys(
        [
            'Postcode',
            'Distance from CBD (km)',
            'Offence Level 1 Description',
            'Offence Level 2 Description',
        ],
        'first',
    ),
    'Offence Count': 'sum',
})

In [60]:
# Offence totals at level 1 detail.
# Produces one row per (Date, Suburb, Offence Level 1 Description):
# 'Offence Count' is summed across the group, and every other column takes
# the value from the first row of the group.
# NOTE(review): 'Postcode' and 'Distance from CBD (km)' come from the first
# row only; this assumes a suburb name maps to a single postcode — TODO confirm.
# NOTE(review): groupby's default dropna=True leaves out rows whose Date,
# Suburb or description is missing — verify none are expected here.
df_level1 = df_50kmradius.groupby(
    by=['Date', 'Suburb', 'Offence Level 1 Description'],
    as_index=False,
).aggregate({
    **dict.fromkeys(
        [
            'Postcode',
            'Distance from CBD (km)',
            'Offence Level 1 Description',
        ],
        'first',
    ),
    'Offence Count': 'sum',
})

In [65]:
# Report the row count at each stage: the cleaned data, the subset matched to
# radius50km postcodes, and the level 2 / level 1 summaries.
print("length of df", len(df))
print("length of df with only suburbs in 50km radius", len(df_50kmradius))
print("length of df_level2", len(df_level2))
print("length of df_level1", len(df_level1))

length of df 97078
length of df with only suburbs in 50km radius 77456
length of df_level2 68046
length of df_level1 54820


In [66]:
# Write each stage of the pipeline to its own CSV file in the working
# directory. index=False keeps the DataFrame row index out of the files.
df.to_csv(path_or_buf='all_data_clean.csv', index=False)
df_50kmradius.to_csv(path_or_buf='50kmradius_data_clean.csv', index=False)
df_level2.to_csv(path_or_buf='level2_data_clean.csv', index=False)
df_level1.to_csv(path_or_buf='level1_data_clean.csv', index=False)