# The purpose of this program is to create safety scores, safety ratings, and accident information for crashes in Melbourne


##  Reading in data and preprocessing

In [1]:
library(reshape2)
library(dplyr)
library(hash)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16

hash-2.2.6.1 provided by Decision Patterns



In [1]:
# Load the raw crash records; text columns stay as plain character vectors
crashes <- read.csv(
    file = "Crashes_Last_Five_Years.csv",
    stringsAsFactors = FALSE
)

To get the safety rating of each road, we look at the total cost to society, including the average cost of damage to vehicles, the cost of a fatality, the cost of a serious injury, and so on. All figures were sourced from the article below:

https://www.aaa.asn.au/wp-content/uploads/2018/03/AAA-National-Road-Safety-Platform_Sep-2017.pdf

In [2]:
# Per-unit cost weights used to turn each crash record into a single cost
# figure (units presumably millions of AUD -- TODO confirm against the source)
cost_per_fatality <- 4.339
cost_per_serious_injury <- 0.239
cost_per_other_injury <- 0.012
cost_per_vehicle <- 0.003

# NOTE: the count columns are overwritten in place with their weighted cost,
# and NONINJURED is replaced by the per-vehicle cost component
crashes$FATALITY <- cost_per_fatality * crashes$FATALITY
crashes$SERIOUSINJURY <- cost_per_serious_injury * crashes$SERIOUSINJURY
crashes$OTHERINJURY <- cost_per_other_injury * crashes$OTHERINJURY
crashes$NONINJURED <- cost_per_vehicle * crashes$NO_OF_VEHICLES

# Total cost of each crash is the sum of the four weighted components
crashes$COST <- with(
    crashes,
    FATALITY + SERIOUSINJURY + OTHERINJURY + NONINJURED
)

# All later cells refer to the columns by their lower-case names
names(crashes) <- tolower(names(crashes))


In [7]:
# Identify each crash location by "<road name>,<intersecting road name>",
# with surrounding whitespace removed
crashes$road <- trimws(
    paste(crashes$road_name, crashes$road_name_int, sep = ",")
)

# Drop records whose identifier is a bare comma (no road name on either side)
crashes <- crashes[crashes$road != ",", ]

# Keep the crashes recorded in urban Melbourne or the Melbourne CBD
melbourne_areas <- c("MELB_URBAN", "MELBOURNE_CBD")
melbs <- crashes[crashes$deg_urban_name %in% melbourne_areas, ]

In [5]:
# Extract the year from a crash date.
#   value: a single date written as three "/"-separated fields
#   returns: the third field as a number (NA when there is no third field)
get_year <- function(value) {
    fields <- unlist(strsplit(as.character(value), "/", fixed = TRUE))
    as.numeric(fields[3])
}

# Year of every crash, obtained by applying get_year() to one date at a time
crashes$year <- vapply(
    crashes$accidentdate,
    get_year,
    numeric(1),
    USE.NAMES = FALSE
)


## Scoring and Rating

With all the preprocessing finished, we compute the total crash cost for each road in 2018. We use these costs to score roads (1-100) and assign a rating (1-3).

In [8]:
# Total crash cost for every (road, year) combination
yearly <- crashes %>%
    group_by(road, year) %>%
    summarise(cost = sum(cost))

# Keep only the 2018 totals
x2018 <- filter(yearly, year == 2018)

# One row per road with a placeholder cost of zero, so that roads without a
# 2018 crash are still present after the left join below
roads <- crashes %>%
    group_by(road) %>%
    summarise(cost = 0)

# After the join `cost.x` is the placeholder and `cost.y` the 2018 total;
# roads with no 2018 crash have no match, so their missing total becomes zero
all2018 <- merge(roads, x2018, by = "road", all.x = TRUE)
all2018$cost.y[is.na(all2018$cost.y)] <- 0

all2018$year <- 2018

To assign a score to each road, we find the quantile of each cost greater than zero and multiply by 100. 
Costs of zero are automatically assigned a score of 1.

In [12]:
# Roads with a positive 2018 cost form the reference distribution for scoring
riskier <- all2018[all2018$cost.y > 0, ]

# Empirical CDF of the positive costs. ecdf() stops on an empty sample, so
# when no road has a positive cost every cost is treated as percentile 0.
get_percentile <- if (nrow(riskier) > 0) {
    ecdf(riskier$cost.y)
} else {
    function(cost) rep(0, length(cost))
}

# Convert road costs into safety scores.
#   cost: a numeric vector (or single value) of road costs
#   returns: percentile of each cost among the positive costs, times 100,
#            with a floor of 1. Zero-cost roads sit at percentile 0 and so
#            score 1; the floor also keeps the cheapest crash roads from
#            scoring below zero-cost roads when there are more than 100
#            positive-cost roads.
assign_score <- function(cost) {
    pmax(1, get_percentile(cost) * 100)
}

# The ECDF is vectorised, so the whole column is scored in one call
all2018$score <- as.numeric(assign_score(all2018$cost.y))

Once roads are scored, we can assign a rating. We decided to use 70 and 95 as the boundaries for the ratings.

In [13]:
# Map a single road score to its rating.
#   score: one numeric score
#   returns: 1 for scores up to 70, 2 for scores above 70 and up to 95,
#            3 for anything higher
get_rating <- function(score) {
    if (score <= 70) {
        return(1)
    }
    if (score <= 95) {
        return(2)
    }
    3
}
# Rate every road from its score (get_rating() takes one score at a time)
all2018$rating <- vapply(all2018$score, get_rating, numeric(1))

# get only roads in melbourne
# The Melbourne subset is taken from `crashes` here rather than reusing an
# earlier `melbs`: the later filter on `year.x` only exists when the left side
# of this merge already carries a `year` column, and `crashes` has it by now.
# Starting from `crashes` also keeps this cell safe to re-run, because the
# merge never stacks on top of a previous merge result.
melbourne_rows <- crashes$deg_urban_name %in% c("MELB_URBAN", "MELBOURNE_CBD")
melbs <- merge(crashes[melbourne_rows, ], all2018, by = "road", all.x = TRUE)

##  Voice Alert Information

For voice alerts, we provide the driver with the accident type description and road geometry.

In [14]:
# melbs data for 2018 only
# %in% treats a missing year as "not 2018"; comparing with == would instead
# insert an all-NA row for every record whose year is NA
current <- melbs[melbs$year.x %in% 2018, ]

# Most frequent accident type description per road. count() adds the tally
# column `n`, top_n() keeps the rows with the highest tally in each road, and
# distinct() keeps the first of any tied descriptions.
accident_type_info <- current %>%
    group_by(road) %>%
    count(accident.type.desc) %>%
    top_n(1, n) %>%
    distinct(road, n, .keep_all = TRUE) %>%
    select(c(road, accident.type.desc))

# Most frequent road geometry description per road, selected the same way
road_geom_info <- current %>%
    group_by(road) %>%
    count(road.geometry.desc) %>%
    top_n(1, n) %>%
    distinct(road, n, .keep_all = TRUE) %>%
    select(c(road, road.geometry.desc))

In [17]:
# Create a dictionary from a two-column table.
#   df: a data frame whose first column holds the keys (road identifiers)
#       and whose second column holds the value stored under each key
#   returns: a hash mapping every key to its value; an empty hash when `df`
#            has no rows
create_dict <- function(df) {
    h <- hash()
    keys <- df[[1]]
    vals <- df[[2]]
    # seq_len() yields no iterations for an empty table, whereas 1:nrow(df)
    # would count down through c(1, 0) and fail on the missing rows
    for (i in seq_len(nrow(df))) {
        h[[keys[i]]] <- vals[i]
    }
    return(h)
}
# Lookup tables keyed by road: most common road geometry and accident type
road_geom_dict <- create_dict(road_geom_info)
accident_type_dict <- create_dict(accident_type_info)

In [18]:
# create the information for voice alerts
#   road: a single road identifier used as key into `accident_type_dict` and
#         `road_geom_dict`
#   returns: one string of the form "<accident phrase> at <road geometry>".
#            Unrecognised accident types become "high risk"; geometries that
#            carry no useful location become "road". A road with no usable
#            dictionary entry gets the same generic wording instead of
#            raising an error.
create_voice_info <- function(road) {
    # Look up one value by key; NULL when the key is unusable or absent
    fetch <- function(dict, key) {
        if (length(key) != 1 || is.na(key)) {
            return(NULL)
        }
        tryCatch(dict[[key]], error = function(e) NULL)
    }
    # TRUE only for a single, non-missing value
    usable <- function(x) length(x) == 1 && !is.na(x)

    # Spoken phrase for each recognised accident type description
    accident_phrases <- c(
        "Collision with vehicle" = "high vehicle collisions",
        "Struck Pedestrian" = "high struck pedestrians",
        "Struck animal" = "high struck animals",
        "Collision with a fixed object" = "high object collision"
    )
    # Geometry descriptions that are replaced by the generic word "road"
    generic_geometry <- c(
        "Not at intersection", "Unknown", "Road closure", "Road closure'"
    )

    accident_type <- fetch(accident_type_dict, road)
    road_geom <- fetch(road_geom_dict, road)

    accident_phrase <- "high risk"
    if (usable(accident_type)) {
        type_key <- as.character(accident_type)
        if (type_key %in% names(accident_phrases)) {
            accident_phrase <- accident_phrases[[type_key]]
        }
    }

    if (!usable(road_geom) || road_geom %in% generic_geometry) {
        road_geom <- "road"
    }

    paste(accident_phrase, "at", road_geom)
}

# Voice alert text for every 2018 record. vapply() returns a character vector
# even when `current` has no rows, so the column always exists; with
# unlist(lapply()) an empty input gives NULL and no column is created.
current$voice_info <- vapply(
    current$road,
    create_voice_info,
    character(1),
    USE.NAMES = FALSE
)

# One row per distinct (road, alert text) pair
road_acc_info_2018 <- distinct(current, road, voice_info)

In [21]:
# Attach the 2018 voice alert text to every Melbourne record; roads without
# an alert entry are marked with the placeholder "None"
new_all <- merge(melbs, road_acc_info_2018, by = "road", all.x = TRUE)
new_all$voice_info[is.na(new_all$voice_info)] <- "None"

# Columns that are exported: rating, score, alert text and the `x`/`y` columns
data <- new_all[, c("rating", "score", "voice_info", "x", "y")]

# Records rated 3 that still have no alert text get a generic warning
lacks_info <- data$voice_info == "None" & data$rating == 3
data[lacks_info, "voice_info"] <- "high risk at road"

In [None]:
# Export the result as comma-separated values with no header row and no
# row names
write.table(
    x = data,
    file = "safety_data.csv",
    sep = ",",
    row.names = FALSE,
    col.names = FALSE
)

------