# Crime Statistics

In [102]:
def add_zero(string):
    """Return ``string`` parsed as an integer and rendered as text.

    Values of 9 or below get a literal ``"0"`` placed in front of them
    (``"5"`` -> ``"05"``); larger values are returned without padding
    (``"2015"`` -> ``"2015"``).
    """
    number = int(string)
    return f"0{number}" if number <= 9 else str(number)
    
def str_to_int(string):
    """Convert ``string`` to an ``int`` when possible.

    Returns:
        ``''`` when ``string`` is empty or the literal text ``'nan'``;
        the parsed integer when ``int(string)`` succeeds;
        otherwise ``string`` itself, unchanged.
    """
    if string in ('', 'nan'):
        return ''
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the raw value; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        return string

## Map Postal Code to State (Code & Name)

In [103]:
import pandas

postal_code_data_frame = pandas.read_csv("../postal-code.csv")

# Lookup keyed by the upper-cased place name (column 1) with spaces turned
# into dashes; each entry carries the state name (column 2) and state code
# (column 3) of that row. Later rows overwrite earlier rows with the same key.
postal_codes_dict = {
    row[1].upper().replace(" ", "-"): {
        "state_code": row[3],
        "state_name": row[2],
    }
    for row in postal_code_data_frame.values
}



## Merge all files

In [104]:
# Two-digit start years of the yearly files: raw/2010-11.csv .. raw/2021-22.csv.
start_year = 10
end_year = 21

# State used when a suburb has no entry in ``postal_codes_dict``.
unknown_state = {"state_code": "", "state_name": ""}
# State used for rows whose suburb is the literal "NOT DISCLOSED".
not_disclosed_state = {
    "state_code": "NOT DISCLOSED",
    "state_name": "NOT DISCLOSED",
}

# One dict per input row, accumulated across every yearly file.
data = []

# The outer loop variable is deliberately NOT called ``year``: each row
# unpacks its own ``year`` below, which previously clobbered the loop variable.
for file_year in range(start_year, end_year + 1):
    file_name = f"raw/20{file_year}-{file_year + 1}.csv"
    print(file_name)
    crimes_by_year_data_frame = pandas.read_csv(file_name)
    for columns in crimes_by_year_data_frame.values:
        # Column 0 is split on "/" and its three parts are taken, in order,
        # as date, month and year, each zero-padded by ``add_zero``.
        date, month, year = map(add_zero, columns[0].split("/"))
        suburb = columns[1]
        postal_code = str_to_int(str(columns[2]))
        description_level_1 = columns[3]
        description_level_2 = columns[4]
        description_level_3 = columns[5]
        count = columns[6]
        # Same normalisation as the postal-code lookup keys:
        # upper-case, spaces replaced with dashes.
        key = str(suburb).upper().replace(" ", "-")
        state = postal_codes_dict.get(key, unknown_state)
        if suburb == "NOT DISCLOSED":
            state = not_disclosed_state
        data.append({
            "year": year,
            "month": month,
            "date": date,
            "suburb": suburb,
            "postal_code": postal_code,
            "state_code": state["state_code"],
            "state_name": state["state_name"],
            "description_level_1": description_level_1,
            "description_level_2": description_level_2,
            "description_level_3": description_level_3,
            "count": count,
        })

print(len(data))

raw/2010-11.csv
raw/2011-12.csv
raw/2012-13.csv
raw/2013-14.csv
raw/2014-15.csv
raw/2015-16.csv
raw/2016-17.csv
raw/2017-18.csv
raw/2018-19.csv
raw/2019-20.csv
raw/2020-21.csv
raw/2021-22.csv
1121724


## Write to File

In [105]:
# Turn the accumulated row dicts into a table and persist it as one CSV,
# with a header row and without the DataFrame index column.
df = pandas.DataFrame(data)
df.to_csv("clean/2010-22.csv", header=True, index=False)

## Import to PostgreSQL

In [106]:

# import psycopg2
  
# conn = psycopg2.connect(database="australia",
#                         user='username', password='password', 
#                         host='localhost', port='5432'
# )
  
# conn.autocommit = True
# cursor = conn.cursor()
  
# copy_sql = '''
# COPY public."CrimeStatistics" (date,suburb,postal_code,description_level_1,description_level_2,description_level_3,count)
# FROM '/Users/hieudoan/Documents/github.com/hieudoanm/australia/data/crime-statistics/clean/2010-22.csv'
# DELIMITER ','
# CSV HEADER;
# '''
  
# cursor.execute(copy_sql)

# conn.commit()
# conn.close()