In [1]:
# load dependencies
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
from sqlalchemy import create_engine

In [3]:
# Load the raw crime CSV into a pandas DataFrame; the path is named so it is
# easy to spot and change.
RAW_CRIME_CSV = '2022-23_data_sa_crime.csv'
raw_data = pd.read_csv(RAW_CRIME_CSV)

In [4]:
# Build the cleaned frame from raw_data without modifying raw_data itself:
# dropping the unused level 3 description already returns a new DataFrame.
df = raw_data.drop(columns=['Offence Level 3 Description'])

# the reported date is given as DD/MM/YYYY text; parse it into real datetimes
df['Reported Date'] = pd.to_datetime(df['Reported Date'], format='%d/%m/%Y')

# positional rename: relies on the six remaining columns being in this order
df = df.set_axis(
    ['Date', 'Suburb', 'Postcode', 'Offence Level 1 Description', 'Offence Level 2 Description', 'Offence Count'],
    axis=1,
)

In [5]:
# Every row of the file is a single "postcode,suburb,distance from cbd" string
# and there is no header row, so it is first read as one unnamed column.
radius50km = pd.read_csv('radius50kmcbd.csv', header=None)

# break that single column apart on "," and name the three resulting fields
radius50km = radius50km[0].str.split(',', expand=True)
radius50km.columns = ['Postcode', 'Location', 'Distance from CBD (km)']

# the location name is not used further on
radius50km = radius50km.drop(columns=['Location'])


def _mean_distance(values):
    """Mean of the text distances as floats; NaNs are skipped, zeros are included."""
    return values.astype(float).mean(skipna=True)


# collapse to one row per postcode holding the average distance from the CBD
radius50km = radius50km.groupby(['Postcode'], as_index=False).agg(
    {'Distance from CBD (km)': _mean_distance}
)

In [6]:
# Keep only offences whose postcode is listed in the radius50km lookup, then
# attach that postcode's distance from the CBD as an extra column.
within_radius = df['Postcode'].isin(radius50km['Postcode'])
df_50kmradius = df.loc[within_radius].merge(radius50km, on='Postcode')

In [7]:
# One row per (date, suburb, level 2 offence): offence counts are summed and the
# descriptive columns take the first value seen in each group.
level2_group_keys = ['Date', 'Suburb', 'Offence Level 2 Description']
level2_aggregations = {
    'Postcode': 'first',
    'Distance from CBD (km)': 'first',
    'Offence Level 1 Description': 'first',
    # the grouping column is aggregated as well, exactly as before
    'Offence Level 2 Description': 'first',
    'Offence Count': 'sum',
}
df_level2 = df_50kmradius.groupby(level2_group_keys, as_index=False).agg(level2_aggregations)

In [8]:
# One row per (date, suburb, level 1 offence): offence counts are summed and the
# descriptive columns take the first value seen in each group.
level1_group_keys = ['Date', 'Suburb', 'Offence Level 1 Description']
level1_aggregations = {
    'Postcode': 'first',
    'Distance from CBD (km)': 'first',
    # the grouping column is aggregated as well, exactly as before
    'Offence Level 1 Description': 'first',
    'Offence Count': 'sum',
}
df_level1 = df_50kmradius.groupby(level1_group_keys, as_index=False).agg(level1_aggregations)

In [9]:
# report how many rows each intermediate frame holds
row_count_report = [
    ("length of df", df),
    ("length of df with only suburbs in 50km radius", df_50kmradius),
    ("length of df_level2", df_level2),
    ("length of df_level1", df_level1),
]
for label, frame in row_count_report:
    print(f"{label} {len(frame)}")

length of df 97078
length of df with only suburbs in 50km radius 77456
length of df_level2 68046
length of df_level1 54820


In [10]:
# export all to csv
# df.to_csv('all_data_clean.csv', index=False)
# df_50kmradius.to_csv('50kmradius_data_clean.csv', index=False)
# df_level2.to_csv('level2_data_clean.csv', index=False)
# df_level1.to_csv('level1_data_clean.csv', index=False)

In [11]:
# Shorter display labels for the offence descriptions, keyed by the column they
# apply to (the nested-dict form that DataFrame.replace accepts).
level1_short_terms = {
    'OFFENCES AGAINST THE PERSON': 'PERSON',
    'OFFENCES AGAINST PROPERTY': 'PROPERTY',
}

level2_short_terms = {
    'FRAUD DECEPTION AND RELATED OFFENCES': 'FRAUD',
    'PROPERTY DAMAGE AND ENVIRONMENTAL': 'PROPERTY DAMAGE',
    'SERIOUS CRIMINAL TRESPASS': 'TRESPASS',
    'THEFT AND RELATED OFFENCES': 'THEFT',
    'ACTS INTENDED TO CAUSE INJURY': 'INJURY',
    'OTHER OFFENCES AGAINST THE PERSON': 'OTHER',
    'ROBBERY AND RELATED OFFENCES': 'ROBBERY',
    'SEXUAL ASSAULT AND RELATED OFFENCES': 'SEXUAL ASSAULT',
    'HOMICIDE AND RELATED OFFENCES': 'HOMICIDE',
}

short_terms = {
    'Offence Level 1 Description': level1_short_terms,
    'Offence Level 2 Description': level2_short_terms,
}

In [12]:
# Final frame: the 50 km radius data with the long offence descriptions swapped
# for their short terms. replace() returns a new frame, so df_50kmradius is untouched.
df_final = df_50kmradius.replace(short_terms)

In [13]:
# write df_final to CSV; index=False leaves the DataFrame index out of the file
df_final.to_csv('final_data_clean.csv', index=False)

In [14]:
# NOTE(review): this upgrades sqlalchemy in the kernel's environment without a
# version pin, and only after the first cell has already imported it; restart
# the kernel afterwards so the upgraded package is the one in use.
%pip install --upgrade sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# using sqlite and sqlalchemy, export df_final to a sql database
from sqlalchemy import create_engine

engine = create_engine('sqlite:///crime_data.db', echo=True)
sqlite_table = "crime_data"

# SELECT first ten entries from the database
sqlite_query = 'SELECT * FROM crime_data LIMIT 10'

# Open the connection in a context manager so it is always closed again, even
# if the write or the read raises (previously it stayed open for the whole
# kernel session).
with engine.connect() as sqlite_connection:
    # NOTE(review): to_sql keeps its default index=True, so the DataFrame index
    # is stored as an extra "index" column (it shows up in df_sql).
    df_final.to_sql(sqlite_table, sqlite_connection, if_exists='replace')

    # read a small sample back to confirm the export worked
    df_sql = pd.read_sql_query(sqlite_query, sqlite_connection)

NameError: name 'df_final' is not defined

In [18]:
# display the rows that were read back from the crime_data table
df_sql

Unnamed: 0,index,Date,Suburb,Postcode,Offence Level 1 Description,Offence Level 2 Description,Offence Count,Distance from CBD (km)
0,0,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,FRAUD,1,0.0
1,1,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,PROPERTY DAMAGE,1,0.0
2,2,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,PROPERTY DAMAGE,2,0.0
3,3,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,TRESPASS,1,0.0
4,4,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,THEFT,2,0.0
5,5,2022-07-01 00:00:00.000000,ADELAIDE,5000,PROPERTY,THEFT,8,0.0
6,6,2022-07-01 00:00:00.000000,ADELAIDE,5000,PERSON,INJURY,1,0.0
7,7,2022-07-02 00:00:00.000000,ADELAIDE,5000,PROPERTY,TRESPASS,1,0.0
8,8,2022-07-02 00:00:00.000000,ADELAIDE,5000,PROPERTY,THEFT,2,0.0
9,9,2022-07-02 00:00:00.000000,ADELAIDE,5000,PROPERTY,THEFT,7,0.0


In [145]:
import matplotlib.pyplot as plt
from ipywidgets import widgets, interactive_output, HBox, VBox
from IPython.display import display
from sqlalchemy import create_engine

# Load the whole crime_data table for the dashboard. The connection is only
# needed for this one query, so it is closed as soon as the read finishes
# instead of staying open for the rest of the session.
with create_engine('sqlite:///crime_data.db', echo=False).connect() as sqlite_connection:
    df = pd.read_sql_query('SELECT * FROM crime_data', sqlite_connection)

# make sure Date is a datetime column again after the round trip through the database
df['Date'] = pd.to_datetime(df['Date'])

# Dropdown choices. dropna() keeps sorted() from failing on a mix of text and
# missing values should the table ever contain nulls in these columns.
suburbs = ['All Suburbs'] + sorted(df['Suburb'].dropna().unique())
offence_level_2_values = sorted(df['Offence Level 2 Description'].dropna().unique())

suburb_dropdown = widgets.Dropdown(options=suburbs, description='Suburb:')
offence_level_2_dropdown = widgets.Dropdown(options=['', 'All Offences'] + offence_level_2_values, description='Offence:')

# read the logo bytes inside a with-block so the file handle is closed right away
image_path = 'policelogo.png'
with open(image_path, "rb") as image_file:
    image_widget = widgets.Image(value=image_file.read(), format='png', width=100, height=100)

def _draw_offence_bars(counts, title, xlabel, tick_rotation):
    """Draw `counts` (largest first) as an annotated bar chart on the current axes."""
    axes = counts.sort_values(ascending=False).plot(
        kind='bar', color='lightcoral', edgecolor='indianred', hatch='++', zorder=1
    )
    plt.title(title, fontweight='bold', fontname='Calibri', fontsize=20)
    plt.xlabel(xlabel, fontweight='bold', fontname='Calibri')
    plt.ylabel('Number of Offences', fontweight='bold', fontname='Calibri')

    # write each bar's value just above it
    label_box = dict(boxstyle="round,pad=0.3", edgecolor='none', facecolor='lightcoral', alpha=0.5)
    for bar in axes.patches:
        plt.annotate(
            f"{int(bar.get_height())}",
            (bar.get_x() + bar.get_width() / 2., bar.get_height()),
            ha='center', va='center', xytext=(0, 10), textcoords='offset points',
            bbox=label_box,
            weight='bold',
            fontname='Arial',
            fontsize=11
        )

    plt.xticks(rotation=tick_rotation)
    plt.grid(axis='y', linestyle='dashed')
    plt.gca().set_axisbelow(True)


def _draw_monthly_pie(offence_df, offence_level_2):
    """Draw the share of `offence_df` rows per calendar month as a pie on the current axes."""
    pie_colors = ['#ff392e', '#ff3126', '#ff1a0e', '#fc0c00', '#f40c00', '#d40a00', '#ffebea', '#fedcda', '#fec4c1', '#ffada9', '#ff9690', '#ff675f']

    # NOTE(review): this is the share of rows per month, not of summed 'Offence Count'.
    month_numbers = offence_df['Date'].dt.month
    monthly_share = month_numbers.value_counts(normalize=True).sort_index() * 100

    # Take each label from the same month-number ordering as the values.
    # Using month_name().unique() returned the names in order of appearance in
    # the data, which put the wrong month name on the wedges.
    month_labels = offence_df['Date'].dt.month_name().groupby(month_numbers).first()

    # pull the busiest month(s) slightly out of the pie
    peak_share = monthly_share.max()
    explode = [0.1 if share == peak_share else 0 for share in monthly_share]

    plt.pie(monthly_share, labels=month_labels.tolist(), autopct='%1.1f%%', startangle=90, colors=pie_colors, explode=explode)
    plt.title(f'Monthly Distribution for: {offence_level_2}', fontweight='bold', fontname='Calibri', fontsize=20)
    plt.axis('equal')


def _draw_likelihood_lines(all_df, suburb_df, suburb_totals, suburb):
    """Plot each offence's share of all offences, for the selection and for the whole data set."""
    suburb_share = (suburb_totals / suburb_df['Offence Count'].sum()) * 100
    overall_share = (all_df.groupby('Offence Level 2 Description')['Offence Count'].sum() / all_df['Offence Count'].sum()) * 100
    overall_share = overall_share.sort_values(ascending=False)
    positions = range(len(overall_share))

    # Put the suburb's values in the same offence order as the x axis. Sorting
    # them on their own drew points under the wrong offence label. An offence
    # that was never recorded in the suburb counts as 0 %.
    suburb_share = suburb_share.reindex(overall_share.index, fill_value=0)

    plt.plot(positions, overall_share.values, marker='o', linestyle='dashed', label='Avg Likelihood %', color='black')
    for x, y in zip(positions, overall_share.values):
        plt.text(x, y, f'{y:.0f}%', ha='left', va='bottom', fontsize=10, color='white', weight='bold',
                 bbox=dict(boxstyle='round,pad=0.3', edgecolor='none', facecolor='black'))

    plt.plot(positions, suburb_share.values, marker='o', label=f'{suburb} Likelihood %', color='lightcoral')
    for x, y in zip(positions, suburb_share.values):
        plt.text(x, y, f'{y:.0f}%', ha='right', va='bottom', fontsize=10, color='white', weight='bold',
                 bbox=dict(boxstyle='round,pad=0.3', edgecolor='none', facecolor='red'))

    plt.xticks(positions, overall_share.index, rotation=0, ha='center', fontweight='bold', fontname='Calibri')
    plt.grid(axis='x', linestyle='dashed')
    plt.gca().set_axisbelow(True)
    plt.xlabel('Offence', fontweight='bold', fontname='Calibri', fontsize=12)
    plt.ylabel('likelihood %', fontweight='bold', fontname='Calibri', fontsize=12)
    plt.title(f'Chance of crime occurring in: {suburb}', fontweight='bold', fontname='Calibri', fontsize=20)
    plt.legend()


def _draw_level1_pie(suburb_df, suburb):
    """Draw the level 1 offence split of `suburb_df` as a pie on a figure of its own."""
    level1_totals = suburb_df.groupby("Offence Level 1 Description")["Offence Count"].sum()
    custom_colors = ['#ffc9bb', '#ff8164']

    fig, ax = plt.subplots()
    # Drawn once: a second, ring-shaped pie used to be painted over this one in
    # the same colours, which only duplicated every label.
    ax.pie(level1_totals.values, labels=level1_totals.index, autopct='%1.1f%%', startangle=90, colors=custom_colors)
    ax.axis('equal')

    plt.title(f'Offence against % for: {suburb}', fontweight='bold', fontname='Calibri', fontsize=20)
    plt.subplots_adjust(left=0.1, right=1.9, top=0.9, bottom=0.1)


def update_plots(suburb, offence_level_2=None):
    """Redraw the dashboard charts for the current dropdown selection.

    Reads the module-level DataFrame ``df``.

    Parameters
    ----------
    suburb : str
        A suburb name, or 'All Suburbs' for no suburb filter. Only used when
        no specific offence is selected.
    offence_level_2 : str, optional
        A level 2 offence description. None, '' and 'All Offences' all mean
        "no specific offence".

    With no specific offence: a bar chart of offences in the selection, a line
    chart comparing the selection's offence shares with the whole data set, and
    a separate pie of the level 1 split.

    With a specific offence: a bar chart of the ten suburbs with the most of
    that offence and a pie of its monthly distribution within those suburbs.
    """
    # 'All Offences' is a non-empty string; treated as a real offence it matched
    # no rows and the plotting calls failed on the empty result.
    if offence_level_2 == 'All Offences':
        offence_level_2 = None

    plt.figure(figsize=(12, 15))
    plt.subplot(2, 1, 1)

    if offence_level_2:
        # restrict to the chosen offence, then to its ten busiest suburbs
        offence_df = df[df['Offence Level 2 Description'] == offence_level_2]
        top_suburbs = offence_df.groupby('Suburb')['Offence Count'].sum().nlargest(10).index
        offence_df = offence_df[offence_df['Suburb'].isin(top_suburbs)]

        _draw_offence_bars(
            offence_df.groupby('Suburb')['Offence Count'].sum(),
            f'Top 10 Suburbs for Offence: {offence_level_2}',
            'Suburb',
            20,
        )

        plt.subplot(2, 1, 2)
        _draw_monthly_pie(offence_df, offence_level_2)
        plt.tight_layout()
    else:
        if suburb == 'All Suburbs':
            suburb_df = df
            title = 'All Suburbs'
        else:
            suburb_df = df[df['Suburb'] == suburb]
            title = f'Offences in Suburb: {suburb}'

        offence_totals = suburb_df.groupby('Offence Level 2 Description')['Offence Count'].sum()
        _draw_offence_bars(offence_totals, title, 'Offence Description', 0)

        plt.subplot(2, 1, 2)
        _draw_likelihood_lines(df, suburb_df, offence_totals, suburb)
        plt.tight_layout()

        _draw_level1_pie(suburb_df, suburb)

    # Show the figures on every path; previously this was only reached when no
    # offence was selected.
    plt.show()
    
# Lay the dashboard out: logo beside the two dropdowns on top, the plot output
# underneath. interactive_output re-runs update_plots whenever a dropdown changes.
plot_controls = {'suburb': suburb_dropdown, 'offence_level_2': offence_level_2_dropdown}
out = interactive_output(update_plots, plot_controls)

dropdown_column = VBox([suburb_dropdown, offence_level_2_dropdown])
hbox = HBox([image_widget, dropdown_column])

layout = VBox([hbox, out])
display(layout)

VBox(children=(HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\xb0\x00\x00\x04\xc…