## Importing Required Libraries

In [1]:
import pandas as pd
from datetime import datetime
import geopandas as gpd


## Load the datasets

In [2]:
# Source: https://discover.data.vic.gov.au/dataset/victoria-road-crash-data
# The main crash table is read as GeoJSON and then wrapped in a plain pandas DataFrame.
gdf = gpd.read_file('VICTORIAN_ROAD_CRASH_DATA.geojson')
vic_crash_data = pd.DataFrame(gdf)

# The three supporting tables are plain CSV files.
accident_location, atmospheric_cond, road_surface_cond = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ACCIDENT_LOCATION.csv',
        'ATMOSPHERIC_COND.csv',
        'ROAD_SURFACE_COND.csv',
    )
)

## Merge the DataFrames

In [3]:
# Left-join each supporting table onto the crash records via the shared 'ACCIDENT_NO' key,
# so every crash row is kept even when a supporting table has no match.
# NOTE(review): if a supporting table holds several rows per ACCIDENT_NO, these joins
# will repeat the crash row once per match — confirm key uniqueness in the CSVs.
df = (
    vic_crash_data
    .merge(accident_location, on='ACCIDENT_NO', how='left')
    .merge(atmospheric_cond, on='ACCIDENT_NO', how='left')
    .merge(road_surface_cond, on='ACCIDENT_NO', how='left')
)



## Creating the 'year', 'month', 'day' and 'time' columns

In [4]:
# Print the merged frame's column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175447 entries, 0 to 175446
Data columns (total 67 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   ACCIDENT_NO             175447 non-null  object  
 1   ACCIDENT_DATE           175447 non-null  object  
 2   ACCIDENT_TIME           175447 non-null  object  
 3   ACCIDENT_TYPE           175447 non-null  object  
 4   DAY_OF_WEEK             175447 non-null  object  
 5   DCA_CODE                175447 non-null  object  
 6   DCA_CODE_DESCRIPTION    175447 non-null  object  
 7   LIGHT_CONDITION         175447 non-null  object  
 8   POLICE_ATTEND           175447 non-null  object  
 9   ROAD_GEOMETRY           175447 non-null  object  
 10  SEVERITY                175447 non-null  object  
 11  SPEED_ZONE              175447 non-null  object  
 12  RUN_OFFROAD             175447 non-null  object  
 13  ROAD_NAME_x             175447 non-null  object  
 14  ROAD

In [5]:
def convert_to_datetime(row):
    """Split one crash record's date and time fields into separate parts.

    Despite its name, this does not build a ``datetime`` object; it returns
    the individual components as a tuple.

    Parameters
    ----------
    row : mapping
        Must provide ``ACCIDENT_DATE``, ``ACCIDENT_TIME`` and ``DAY_OF_WEEK``.
        The date is converted to text and read positionally: the first four
        characters are the year and the next two the month number. The time
        is converted to text and left-padded with zeros to six characters;
        the first two are the hour and the next two the minute.

    Returns
    -------
    tuple
        ``(year, month_name, day, "HH:MM")`` where ``year`` is an int,
        ``month_name`` is the full month name, and ``day`` is the
        ``DAY_OF_WEEK`` value passed through unchanged.
    """
    raw_date = str(row['ACCIDENT_DATE'])
    # Left-pad so the positional slices below always see hhmmss
    raw_time = str(row['ACCIDENT_TIME']).zfill(6)

    year_number = int(raw_date[:4])
    # Round-trip the two-digit month through strptime/strftime to get its full name
    month_name = datetime.strptime(raw_date[4:6], '%m').strftime('%B')
    weekday = row['DAY_OF_WEEK']

    hh, mm = int(raw_time[:2]), int(raw_time[2:4])

    return year_number, month_name, weekday, f"{hh:02}:{mm:02}"

# Compute the (year, month, day, time) tuple for every row, then transpose the
# per-row tuples into four column-sized sequences
date_parts = df.apply(convert_to_datetime, axis=1)
df['year'], df['month'], df['day'], df['time'] = zip(*date_parts)

# The raw source columns are superseded by the derived ones
df = df.drop(columns=['ACCIDENT_DATE', 'ACCIDENT_TIME', 'DAY_OF_WEEK'])

In [6]:
# Keep only crashes from 2019 through 2022 (both ends inclusive).
# The explicit .copy() makes the result an independent frame, so the column
# assignments that follow (e.g. df['State'] = 'VIC') do not raise
# SettingWithCopyWarning on a filtered slice.
df = df[df['year'].between(2019, 2022)].copy()

## Rename the 'ACCIDENT_NO' column to 'report_id'

In [7]:
# Rebind to the renamed frame instead of mutating in place
df = df.rename(columns={'ACCIDENT_NO': 'report_id'})

## Add a new column 'State' with all values set to 'VIC'

In [8]:
# Every record in this dataset gets the constant state code 'VIC'
df = df.assign(State='VIC')

## Creating the 'stats_area' column

In [9]:
# Source categories grouped under the label each one is replaced with
area_mapping = {
    source: target
    for target, sources in {
        '1 City': ['MELBOURNE_CBD'],
        '2 Metropolitan': ['MELB_URBAN', 'SMALL_CITIES', 'LARGE_PROVINCIAL_CITIES'],
        '3 Country': ['RURAL_VICTORIA', 'TOWNS', 'SMALL_TOWNS'],
    }.items()
    for source in sources
}

df = (
    df
    # Step 1: rename the source column
    .rename(columns={'DEG_URBAN_NAME': 'stats_area'})
    # Step 2: replace the listed categories; any other value is left unchanged
    .assign(stats_area=lambda frame: frame['stats_area'].replace(area_mapping))
    # Step 3: discard rows that have no stats_area value
    .dropna(subset=['stats_area'])
)

## Creating the 'lga' column

In [10]:
# Step 1: Rename the source column
df = df.rename(columns={'LGA_NAME': 'lga'})

# Step 2: Keep only rows whose lga is present and is not the single-space placeholder
has_lga = df['lga'].notna() & df['lga'].ne(' ')
df = df[has_lga]

## Creating the 'speed_limit' column

In [11]:
# Rename the source column
df = df.rename(columns={'SPEED_ZONE': 'speed_limit'})

# Drop rows whose speed zone is one of the non-numeric labels.
# .copy() makes the filtered result an independent frame, so the assignment
# below does not raise SettingWithCopyWarning.
non_numeric_zones = ['Not known', 'Camping grounds or off road', 'Other speed limit']
df = df[~df['speed_limit'].isin(non_numeric_zones)].copy()

# Strip every non-digit character (this also removes a unit suffix such as
# ' km/h', so no separate replace is needed), then convert the remaining
# digits to integers. astype(int) still raises if a value has no digits left.
df['speed_limit'] = df['speed_limit'].str.replace(r'\D', '', regex=True).astype(int)


## Creating the 'loc_type' and 'Location' columns

In [12]:
def classify_location(row):
    """Build a location label and location type for one crash record.

    A record whose ``DISTANCE_LOCATION`` equals 0 is labelled
    ``'Intersection'`` and its location is ``"<road> & <intersecting road>"``;
    any other record is labelled ``'Midblock'`` and its location is the
    primary road only.

    Missing or blank name/type parts are skipped rather than being rendered
    as the literal text ``"nan"``/``"None"``. If the intersecting road has no
    usable parts, the location falls back to the primary road alone.

    Parameters
    ----------
    row : mapping
        Must provide ``DISTANCE_LOCATION``, ``ROAD_NAME_x``, ``ROAD_TYPE_x``,
        ``ROAD_NAME_INT`` and ``ROAD_TYPE_INT``.

    Returns
    -------
    pandas.Series
        Two elements in the order ``[location, loc_type]``.
    """
    def _join_present(*parts):
        # Join only the parts that are neither missing (NaN/None) nor blank
        return ' '.join(
            str(part) for part in parts
            if pd.notna(part) and str(part).strip() != ''
        )

    primary_road = _join_present(row['ROAD_NAME_x'], row['ROAD_TYPE_x'])

    if row['DISTANCE_LOCATION'] == 0:
        loc_type = 'Intersection'
        crossing_road = _join_present(row['ROAD_NAME_INT'], row['ROAD_TYPE_INT'])
        location = f"{primary_road} & {crossing_road}" if crossing_road else primary_road
    else:
        loc_type = 'Midblock'
        location = primary_road
    return pd.Series([location, loc_type])

# Classify every row, then store the two returned values as new columns
# (first value -> 'Location', second value -> 'loc_type')
location_parts = df.apply(classify_location, axis=1)
df[['Location', 'loc_type']] = location_parts

## Creating weather_cond column

In [13]:
# Step 1: Rename the source column
df = df.rename(columns={'ATMOSPH_COND_DESC': 'weather_cond'})

# Step 2: Collapse the listed conditions into 'Not Raining' / 'Raining';
# values not listed here are left unchanged
dry_conditions = ['Clear', 'Dust', 'Fog', 'Smoke', 'Strong winds']
weather_mapping = {condition: 'Not Raining' for condition in dry_conditions}
weather_mapping['Snowing'] = 'Raining'
df['weather_cond'] = df['weather_cond'].replace(weather_mapping)

# Step 3: Remove rows whose condition is recorded as 'Not known'
df = df.loc[df['weather_cond'].ne('Not known')]


## Creating the light_cond column

In [14]:
# Step 1: Work on an independent copy so the assignments below do not raise
# SettingWithCopyWarning
df = df.copy()

# Step 2: Rename the source column
df = df.rename(columns={'LIGHT_CONDITION': 'light_cond'})

# Step 3: Treat 'Dusk/Dawn' as 'Day'
df.loc[df['light_cond'] == 'Dusk/Dawn', 'light_cond'] = 'Day'

# Step 4: Any value containing "Dark" becomes "Night" (missing values are left alone)
df.loc[df['light_cond'].str.contains("Dark", na=False), 'light_cond'] = "Night"

# Step 5: Remove rows whose light condition is 'Unk.'.
# Copy again after filtering: otherwise the frame leaving this cell is a
# filtered slice, and the next column assignment raises the same
# SettingWithCopyWarning that Step 1 was added to avoid.
df = df[df['light_cond'] != 'Unk.'].copy()

# Verify changes
print(df['light_cond'].head())

102125      Day
102126    Night
102127    Night
102129      Day
102130    Night
Name: light_cond, dtype: object


## Creating the csef_severity column

In [15]:


# Step 1: Rename the source column
df = df.rename(columns={'SEVERITY': 'csef_severity'})

# Step 2: Recode the severity descriptions; values not listed are left unchanged.
# NOTE(review): 'Other injury accident' is mapped to '1: PDO', the same code as
# 'Non injury accident' — confirm this is intended rather than '2: INJ'.
severity_codes = {
    'Fatal accident': '3: Fatal',
    'Serious injury accident': '2: INJ',
    'Other injury accident': '1: PDO',
    'Non injury accident': '1: PDO',
}
df = df.assign(csef_severity=df['csef_severity'].replace(severity_codes))


## Function to standardize column names

In [16]:
def standardize_column_names(columns):
    """Return lower-case, underscore-separated versions of the given names.

    Each name has surrounding whitespace stripped, every remaining space and
    hyphen replaced by an underscore, and is then lower-cased.

    Parameters
    ----------
    columns : iterable of str
        The column names to normalise.

    Returns
    -------
    list of str
        The normalised names, in the same order as the input.
    """
    # One translation table handles both separator characters in a single pass
    separators = str.maketrans({' ': '_', '-': '_'})
    cleaned = []
    for name in columns:
        cleaned.append(name.strip().translate(separators).lower())
    return cleaned

# Compute the normalised names from the current headers, then install them on the frame
standardized_names = standardize_column_names(df.columns)
df.columns = standardized_names

## Standardizing columns (Keeping only the columns required)

In [17]:
# Standardized column names to retain, in output order
columns_to_keep = [
    'report_id',
    'year', 'month', 'day', 'time',
    'state', 'stats_area', 'lga',
    'latitude', 'longitude',
    'loc_type', 'location',
    'light_cond', 'weather_cond',
    'speed_limit',
    'csef_severity',
]

# Keep all rows but only the listed columns (raises KeyError if one is missing)
df = df.loc[:, columns_to_keep]

## Saving the modified DataFrame to a new CSV file

In [18]:
# Write the final frame to 'Final_VIC.csv' without the index column
df.to_csv('Final_VIC.csv', index=False)