## Loading the Required Libraries

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import geopandas as gpd

### Loading the CSV file into a DataFrame

In [2]:
# Source data: https://data.sa.gov.au/data/dataset/road-crash-data/resource/78d24425-6c14-426e-8895-d414c2a12521
# Read the crash CSV from the downloaded extract into `df`.
df = pd.read_csv(
    '2019-2023_data_sa_as_at_20240913/2019-2023_DATA_SA_Crash.csv'
)

## Filtering from 2019 to 2022

### Converting the Year column to integer

In [3]:
# Cast 'Year' to integers so it can be compared numerically.
df = df.assign(Year=df['Year'].astype(int))

### Filtering the DataFrame to include only the records from 2019 to 2022

In [4]:
# Keep only the records whose Year is 2019-2022 (both bounds inclusive).
# .copy() makes the result an independent DataFrame, so the column
# assignments in the following cells (df['Time'] = ..., df['State'] = ...)
# do not raise SettingWithCopyWarning on a slice of the original frame.
df = df[df['Year'].between(2019, 2022)].copy()

## Converting 'Time' Column

In [5]:
# Parse the 12-hour clock strings (format '%I:%M %p') and write them back
# as 24-hour 'HH:MM' strings.
df['Time'] = (
    pd.to_datetime(df['Time'], format='%I:%M %p')
    .dt.strftime('%H:%M')
)


## Add a new column 'State' with all values set to 'SA'

In [6]:
# Every row gets the constant value 'SA' in a 'State' column.
df = df.assign(State='SA')

## Getting Latitude and Longitude Details

### Loading the required second dataset (RoadCrashes_GDA2020.geojson)

In [7]:

# Read the GeoJSON file with geopandas, then hold it as a plain pandas DataFrame.
gdf = gpd.read_file('RoadCrashes_geojson/RoadCrashes_GDA2020.geojson')
df2 = pd.DataFrame(gdf).assign(
    # Cast the join key with the built-in `int` (the recorded info() output shows int64).
    UNIQUE_LOC=lambda frame: frame['UNIQUE_LOC'].astype(int),
    # Store each geometry as its string form so it can be parsed as text later.
    geometry=lambda frame: frame['geometry'].astype(str),
)

# Print the resulting columns and dtypes to confirm the casts.
df2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44420 entries, 0 to 44419
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   UNIQUE_LOC              44420 non-null  int64 
 1   TOTAL_CRASHES           44420 non-null  int32 
 2   CSE_PDO                 44420 non-null  int32 
 3   CSE_INJ                 44420 non-null  int32 
 4   CSE_FAT                 44420 non-null  int32 
 5   CSE_SI                  44420 non-null  int32 
 6   TOTAL_CASUALTIES        44420 non-null  int32 
 7   TOTAL_FATALITIES        44420 non-null  int32 
 8   TOTAL_SERIOUS_INJURIES  44420 non-null  int32 
 9   CTY_REAR_END            44420 non-null  int32 
 10  CTY_HIT_FIXED_OBJECT    44420 non-null  int32 
 11  CTY_SIDE_SWIPE          44420 non-null  int32 
 12  CTY_RIGHT_ANGLE         44420 non-null  int32 
 13  CTY_HEAD_ON             44420 non-null  int32 
 14  CTY_HIT_PEDESTRIAN      44420 non-null  int32 
 15  CT

In [8]:
# Print the column names, non-null counts and dtypes of the crash DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50003 entries, 0 to 54166
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   REPORT_ID         50003 non-null  object 
 1   Stats Area        50003 non-null  object 
 2   Suburb            50003 non-null  object 
 3   Postcode          50003 non-null  int64  
 4   LGA Name          49634 non-null  object 
 5   Total Units       50003 non-null  int64  
 6   Total Cas         50003 non-null  int64  
 7   Total Fats        50003 non-null  int64  
 8   Total SI          50003 non-null  int64  
 9   Total MI          50003 non-null  int64  
 10  Year              50003 non-null  int64  
 11  Month             50003 non-null  object 
 12  Day               50003 non-null  object 
 13  Time              50003 non-null  object 
 14  Area Speed        50003 non-null  int64  
 15  Position Type     50003 non-null  object 
 16  Horizontal Align  50003 non-null  object 
 17

### Merging the df and df2 datasets on the common column "UNIQUE_LOC" to extract the geometry column from the second dataset and add it to the first

In [9]:
# Attach the geometry of each location to the crash records via "UNIQUE_LOC".
# The lookup side is reduced to one row per key first: a left join against
# repeated keys would otherwise silently duplicate crash rows.
# validate='many_to_one' makes pandas raise if that guarantee is ever broken.
df = pd.merge(
    df,
    df2[['UNIQUE_LOC', 'geometry']].drop_duplicates(subset='UNIQUE_LOC'),
    on='UNIQUE_LOC',
    how='left',
    validate='many_to_one',
)

### Cleaning the geometry column and extracting the latitude and longitude into new columns

In [10]:
def parse_point(point):
    """Extract (latitude, longitude) from a 'POINT (lon lat)' style value.

    The value is converted to text, an optional leading 'POINT' tag and any
    parentheses are removed, and the remainder is split on whitespace.

    Parameters
    ----------
    point : any
        Usually a string such as 'POINT (138.6 -34.9)'. Other values
        (NaN, None, ...) are stringified first.

    Returns
    -------
    tuple
        (latitude, longitude) as floats, with the input order (lon, lat)
        swapped. (None, None) when the text does not hold exactly two
        numeric tokens, e.g. 'nan', 'POINT EMPTY' or a 3-D point.
    """
    text = str(point).strip()
    # Drop the tag separately from the parentheses so that both
    # 'POINT (x y)' and 'POINT(x y)' are accepted.
    if text[:5].upper() == 'POINT':
        text = text[5:]
    parts = text.replace('(', ' ').replace(')', ' ').split()
    if len(parts) != 2:
        return None, None
    try:
        lon, lat = (float(part) for part in parts)
    except ValueError:
        # Two tokens that are not numbers: treat as an unparseable value
        # instead of aborting the whole column conversion.
        return None, None
    return lat, lon

# Parse every geometry value, then unzip the (lat, lon) pairs into two columns.
latitudes, longitudes = zip(*map(parse_point, df['geometry']))
df['latitude'] = latitudes
df['longitude'] = longitudes

## Renaming the columns

In [11]:
# Rename the LGA and speed columns; rebinding `df` replaces the in-place rename.
df = df.rename(
    columns={
        'LGA Name': 'LGA',
        'Area Speed': 'speed limit',
    }
)

## Creating the loc_type column

### Standardizing the location types by mapping the 'Position Type' column to loc_type

In [12]:
# 'Position Type' values that are classified as an intersection.
intersection_types = [
    "Cross Road",
    "T-Junction",
    "Y-Junction",
    "Pedestrian Crossing",
    "Multiple",
    "Rail Crossing",
    "Rail Xing",
    "Crossover",
    "Interchange",
]

In [13]:
# Rows whose position type is in `intersection_types` become 'Intersection';
# everything else becomes 'Midblock'.
at_intersection = df['Position Type'].isin(intersection_types)
df['loc_type'] = np.where(at_intersection, 'Intersection', 'Midblock')

## Renaming the column DayNight to Light_cond

In [14]:

# Rename 'DayNight' to 'Light_cond'; rebinding `df` replaces the in-place rename.
df = df.rename(columns={'DayNight': 'Light_cond'})

## Replace the value 'Daylight' with 'Day' in the 'Light_cond' column

In [15]:
# Shorten the 'Daylight' label to 'Day'; all other values are left unchanged.
df['Light_cond'] = df['Light_cond'].replace({'Daylight': 'Day'})

## Function to standardize column names

In [16]:
def standardize_column_names(columns):
    """Return lower-case, underscore-separated versions of column labels.

    Each label is converted to text, stripped of surrounding whitespace,
    has spaces and hyphens replaced by underscores, and is lower-cased.
    Converting to text first means non-string labels (e.g. integers) are
    accepted instead of raising AttributeError on `.strip()`.

    Parameters
    ----------
    columns : iterable
        Column labels, e.g. `df.columns`.

    Returns
    -------
    list of str
        The standardized labels, in the same order as the input.
    """
    return [
        str(label).strip().replace(' ', '_').replace('-', '_').lower()
        for label in columns
    ]

# Compute the standardized labels first, then install them as the column index.
standardized_labels = standardize_column_names(df.columns)
df.columns = standardized_labels

## Standardizing columns (Keeping only the columns required)

In [17]:
# Standardized column names retained in the output, in output order.
columns_to_keep = [
    'report_id', 'year', 'month', 'day', 'time',
    'state', 'stats_area', 'lga',
    'latitude', 'longitude',
    'loc_type', 'light_cond', 'weather_cond',
    'speed_limit', 'csef_severity',
]

# Keep all rows, restricted to the listed columns (in the listed order).
df = df.loc[:, columns_to_keep]

### Remove rows where 'lga', 'latitude' or 'longitude' is missing (NaN)

In [18]:

# Drop every row that is missing any of the three location fields.
# One call with a list replaces three passes and avoids passing a bare
# string as `subset`, which only newer pandas releases accept.
df = df.dropna(subset=['lga', 'latitude', 'longitude'])


### Remove rows where 'weather_cond' is 'Unknown'

In [19]:
# Keep only the rows whose weather condition is not the literal 'Unknown'.
weather_is_known = df['weather_cond'].ne('Unknown')
df = df.loc[weather_is_known]

## Saving the modified DataFrame to a new CSV file

In [20]:
# Write the cleaned frame to disk without the index column.
df.to_csv(path_or_buf='WorkSA1.csv', index=False)