In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import requests
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from meteostat import Point, Daily

In [2]:
# Dataset obtained from (https://portal-mainroads.opendata.arcgis.com/datasets/mainroads::crash-information-last-5-years/explore)
df = pd.read_csv('Crash_Information_(Last_5_Years).csv')

In [3]:
print(df.head())

            X          Y  OBJECTID    ACC_ID  ROAD_NO          ROAD_NAME  \
0  115.782419 -31.979736  51192360  10973801  1150022         Shenton Rd   
1  115.919445 -32.012540  51192361  10973806     H012          Leach Hwy   
2  115.786907 -31.978241  51192362  10973812  1150027         Reserve St   
3  117.853460 -34.987227  51192363  10973819     H001         Albany Hwy   
4  115.968798 -31.901419  51192364  10973834     H005  Great Eastern Hwy   

  COMMON_ROAD_NAME CWAY     SLK  INTERSECTION_NO  ... ACCIDENT_TYPE  \
0       Shenton Rd    S    1.47              NaN  ...      Midblock   
1        Leach Hwy    L   16.13           4455.0  ...  Intersection   
2       Reserve St    S    0.43              NaN  ...      Midblock   
3       Albany Hwy    S  402.13              NaN  ...      Midblock   
4       Johnson St    S   11.02          14738.0  ...  Intersection   

    SEVERITY         EVENT_NATURE            EVENT_TYPE  TOTAL_BIKE_INVOLVED  \
0  PDO Major  Sideswipe Same Dirn   

In [4]:
df.rename(columns={'OBJECTID': 'report_id'}, inplace=True)


In [5]:
# Remove rows with blank or NaN CRASH_TIME
df = df.dropna(subset=['CRASH_TIME'])

# Convert CRASH_TIME to integers to remove decimal points
df['CRASH_TIME'] = df['CRASH_TIME'].astype(int)

# Function to convert CRASH_DATE and CRASH_TIME to year, month, day, and time
def convert_datetime(row):
    date_str = row['CRASH_DATE']
    time_str = str(row['CRASH_TIME']).zfill(4)  # Ensure time is in hhmm format

    # Parse date and time
    try:
        date_obj = datetime.strptime(date_str, '%d/%m/%Y')
    except ValueError as e:
        print(f"Error parsing date: {e}, value: {date_str}")
        return None, None, None, None

    try:
        time_obj = datetime.strptime(time_str, '%H%M')
    except ValueError as e:
        print(f"Error parsing time: {e}, value: {time_str}")
        return None, None, None, None

    # Extract year, month, day, weekday, and formatted time
    year = date_obj.year
    month = date_obj.strftime('%B')
    day = date_obj.strftime('%A')  # Full weekday name
    time = time_obj.strftime('%H:%M')

    return year, month, day, time

# Apply the function to each row
df[['year', 'month', 'day', 'time']] = df.apply(convert_datetime, axis=1, result_type='expand')



In [6]:
print(df[['year', 'month', 'day', 'time']].head())

   year      month       day   time
0  2023  September    Friday  12:30
1  2023  September   Tuesday  14:35
2  2023  September    Monday  20:10
3  2023  September   Tuesday  15:05
4  2023  September  Thursday  08:05


In [7]:
df = df[(df['year'] >= 2019) & (df['year'] <= 2022)]

In [8]:
df['state'] = 'WA'

In [9]:
# Data set downloaded from (https://portal-mainroads.opendata.arcgis.com/datasets/mainroads::legal-speed-limits/about)
legal_speed_limits_path = 'Legal_Speed_Limits.csv'
legal_speed_limits_df = pd.read_csv(legal_speed_limits_path)

# Merge the dataframes on ROAD_NO and Road
merged_df = df.merge(legal_speed_limits_df, left_on='ROAD_NO', right_on='ROAD', how='left')

# Extract the required columns
df['stats_area'] = merged_df['RA_NAME']
df['lga'] = merged_df['LG_NAME']
df['speed_limit'] = merged_df['SPEED_LIMIT']

print(df[['stats_area', 'lga', 'speed_limit']].tail())


                    stats_area           lga speed_limit
125690  Goldfields - Esperance       Yilgarn      60km/h
125860            Metropolitan  Nedlands (C)      70km/h
126279            Metropolitan      Swan (C)     110km/h
126402            Metropolitan   Kwinana (C)      70km/h
126689            Metropolitan  Gosnells (C)     100km/h


In [10]:

# Drop blank columns
df = df.dropna(axis=1, how='all')

# Map the stats_area values
mapping = {
    'Metropolitan': '1 City',
    'South West': '2 Metropolitan'
}
df['stats_area'] = df['stats_area'].map(lambda x: mapping.get(x, '3 Country'))

print(df[['stats_area', 'lga', 'speed_limit']].head())

     stats_area           lga speed_limit
776      1 City     Perth (C)      80km/h
1525     1 City  Cockburn (C)     100km/h
2008     1 City      Swan (C)      60km/h
2009  3 Country     Cunderdin      70km/h
2010     1 City     Mundaring      70km/h


In [11]:
# Rename the ACCIDENT_TYPE column to loc_type
df.rename(columns={'ACCIDENT_TYPE': 'loc_type'}, inplace=True)

In [12]:
# Define a function to create the location column
def determine_location(row):
    if row['loc_type'] == 'Intersection':
        return row['INTERSECTION_DESC']
    elif row['loc_type'] == 'Midblock':
        return row['ROAD_NAME']
    else:
        return None

# Apply the function to create the location column
df['location'] = df.apply(determine_location, axis=1)

# Remove rows where loc_type is 'Roads Open To Public Access' or location is None
df = df[~df['loc_type'].isin(['Roads Open To Public Access']) & df['location'].notna()]


In [13]:
print(df[['location']].head())

                                 location
776                   Norma Rd & McCoy St
1525                         Mitchell Fwy
2008  Jandakot Rd & Berrigan Dr & Dean Rd
2009            Spearwood Av & Beeliar Dr
2010                          Waterloo St


In [14]:
# Function to determine day or night based on time
def determine_light_cond(crash_time):
    time_str = str(crash_time).zfill(4)  # Ensure time is in hhmm format
    time_obj = datetime.strptime(time_str, '%H%M').time()
    if datetime.strptime('0600', '%H%M').time() <= time_obj <= datetime.strptime('1800', '%H%M').time():
        return 'Day'
    else:
        return 'Night'

# Apply the function to create the light_cond column
df['light_cond'] = df['CRASH_TIME'].apply(determine_light_cond)

In [15]:
# Function to extract numbers from speed limit strings
def extract_speed_limit(speed_limit):
    if pd.isna(speed_limit):
        return None
    if isinstance(speed_limit, (int, float)):
        return int(speed_limit)
    return int(re.search(r'\d+', str(speed_limit)).group())

# Apply the function to the SPEED_LIMIT column
df['speed_limit'] = df['speed_limit'].apply(extract_speed_limit)

# Drop rows with NaN values in SPEED_LIMIT
df.dropna(subset=['speed_limit'], inplace=True)

# Display the modified DataFrame
print(df)

                 X          Y  report_id    ACC_ID  ROAD_NO  \
776     115.822312 -32.043478   51193136  10972358  1190010   
1525    115.741337 -31.700702   51193885  10979019     H016   
2008    115.859812 -32.107206   51194368  10608891  1030503   
2009    115.811400 -32.128511   51194369  10225227  1030007   
2010    115.829601 -31.895244   51194370  10377018  1253051   
...            ...        ...        ...       ...      ...   
125690  115.900267 -31.963559   51318050  10935470  1290262   
125860  115.851638 -31.951824   51318220  10964230  1240106   
126279  116.451903 -31.479100   51318639  10936753  4260117   
126402  118.462401 -20.531356   51318762  10955456     H006   
126689  115.827065 -31.945384   51319049  10956091  1270005   

                 ROAD_NAME    COMMON_ROAD_NAME CWAY      SLK  INTERSECTION_NO  \
776               Norma Rd            Norma Rd    S     0.29          47136.0   
1525          Mitchell Fwy        Mitchell Fwy    L    32.24              NaN   


In [16]:


# Mapping dictionary for severity levels
severity_mapping = {
    'Fatal': '4.Fatal',
    'Hospital': '3.SI',
    'Medical': '2.MI',
    'PDO Major': '1.PDO',
    'PDO Minor': '1.PDO'
}

# Apply the mapping to create the csef_severity column
df['csef_severity'] = df['SEVERITY'].map(severity_mapping)

In [17]:


# Function to fetch daily weather data with retries (keeping the logic here but without saving cache)
def fetch_weather(date, lat, lon, retries=5):
    for i in range(retries):
        try:
            dt = datetime.strptime(date, '%d/%m/%Y')
            location = Point(lat, lon)
            weather_data = Daily(location, start=dt, end=dt)
            weather_data = weather_data.fetch()
            if not weather_data.empty:
                precipitation = weather_data.iloc[0]['prcp']
                return 'Raining' if precipitation > 0 else 'Not Raining'
        except Exception as e:
            print(f"Error: {e}. Retrying... ({i + 1}/{retries})")
            if i < retries - 1:
                time.sleep(2 ** i)  
    return 'Timeout'

# Apply parallel weather fetching
def parallel_weather_fetching(df):
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(fetch_weather, row['CRASH_DATE'], row['LATITUDE'], row['LONGITUDE']): index for index, row in df.iterrows()}
        for future in as_completed(futures):
            index = futures[future]
            try:
                result = future.result()
                df.at[index, 'rain_condition'] = result
                print(f"Processed row {index}: {result}")
            except Exception as e:
                print(f"Error processing row {index}: {e}")
                df.at[index, 'rain_condition'] = 'Error'

# Apply the parallel fetching
parallel_weather_fetching(df)

# Display the modified DataFrame
print(df)

# Remove rows where 'rain_condition' is 'Error'
df = df[df['rain_condition'] != 'Error']

# Replace 'Timeout' values
num_timeout = df['rain_condition'].value_counts().get('Timeout', 0)
np.random.seed(0)  # For reproducibility
replacement_values = np.random.choice(['Not Raining', 'Raining'], size=num_timeout, p=[0.7, 0.3])
df.loc[df['rain_condition'] == 'Timeout', 'rain_condition'] = replacement_values

# Rename rain_condition to weather_cond
df.rename(columns={'rain_condition': 'weather_cond'}, inplace=True)



Processed row 776: Raining
Processed row 2146: Not Raining
Processed row 2080: Not Raining
Processed row 2076: Raining
Processed row 2030: Not Raining
Processed row 2084: Raining
Processed row 2049: Raining
Processed row 2046: Not Raining
Processed row 2085: Raining
Processed row 2008: Not Raining
Processed row 2086: Not Raining
Processed row 2088: Raining
Processed row 2152: Not Raining
Processed row 2017: Raining
Processed row 2091: Raining
Processed row 2063: Not Raining
Processed row 2041: Raining
Processed row 2092: Not Raining
Processed row 1525: Not Raining
Processed row 2020: Not Raining
Processed row 2093: Not Raining
Processed row 2067: Not Raining
Processed row 2015: Not Raining
Processed row 2028: Not Raining
Processed row 2097: Not Raining
Processed row 2099: Not Raining
Processed row 2101: Raining
Processed row 2037: Raining
Processed row 2034: Not Raining
Processed row 2102: Raining
Processed row 2053: Not Raining
Processed row 2043: Not Raining
Processed row 2105: Raini

In [18]:
# Standardize column names
def standardize_column_names(columns):
    return [col.strip().replace(' ', '_').replace('-', '_').lower() for col in columns]

df.columns = standardize_column_names(df.columns)

# Specify columns to keep
columns_to_keep = [
    'report_id',
    'year',
    'month',
    'day',
    'time',
    'state',
    'stats_area',
    'lga',
    'latitude',
    'longitude',
    'loc_type',
    'location',
    'light_cond',
    'weather_cond',
    'speed_limit',
    'csef_severity'
]

# Select only the specified columns
df = df[columns_to_keep]

# Save the final DataFrame to WorkWA1.csv
df.to_csv('Final_WA.csv', index=False)