In [1]:
# Libraries
import numpy as np
import pandas as pd
import mlxtend

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the cleaned, combined fatalities table from its relative path
df = pd.read_csv(
    '../dim_tables/cleaned_combined_fatalities_2024.csv'
)
df

Unnamed: 0,Crash ID,State,Month,Year,Day,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,Road User,Gender,Age,Christmas Period,Easter Period,Age Group,Day Type,Time of Day,Number Fatalities
0,20241115,NSW,12,2024,Friday,04:00,Single,No,No,No,100,Driver,Male,74,Yes,No,65_to_74,Weekday,Night,1.0
1,20241125,NSW,12,2024,Friday,06:15,Single,No,No,No,80,Driver,Female,19,No,No,17_to_25,Weekday,Day,1.0
2,20246013,Tas,12,2024,Friday,09:43,Multiple,No,No,No,50,Driver,Female,33,Yes,No,26_to_39,Weekday,Day,1.0
3,20241002,NSW,12,2024,Friday,10:35,Multiple,No,No,No,100,Driver,Female,32,No,No,26_to_39,Weekday,Day,1.0
4,20243185,Qld,12,2024,Friday,13:00,Multiple,No,No,No,100,Passenger,Female,61,No,No,40_to_64,Weekday,Day,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54934,19896006,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Male,13,No,No,0_to_16,Weekday,Night,6.0
54935,19896006,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Female,11,No,No,0_to_16,Weekday,Night,6.0
54936,19896006,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Female,13,No,No,0_to_16,Weekday,Night,6.0
54937,19896006,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Driver,Male,18,No,No,17_to_25,Weekday,Night,6.0


In [3]:
# Summarise the frame: number of entries, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54939 entries, 0 to 54938
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Crash ID                       54939 non-null  int64  
 1   State                          54939 non-null  object 
 2   Month                          54939 non-null  int64  
 3   Year                           54939 non-null  int64  
 4   Day                            54939 non-null  object 
 5   Time                           54939 non-null  object 
 6   Crash Type                     54939 non-null  object 
 7   Bus Involvement                54939 non-null  object 
 8   Heavy Rigid Truck Involvement  54939 non-null  object 
 9   Articulated Truck Involvement  54939 non-null  object 
 10  Speed Limit                    54939 non-null  object 
 11  Road User                      54939 non-null  object 
 12  Gender                         54939 non-null 

In [4]:
# Drop the crash identifier: it identifies a crash rather than describing it,
# so it is not wanted as an attribute in the steps that follow.
# errors='ignore' keeps this cell idempotent -- `df` is rebound in place, so
# without it a second run raises KeyError because 'Crash ID' is already gone.
df = df.drop(columns='Crash ID', errors='ignore')
df

Unnamed: 0,State,Month,Year,Day,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,Road User,Gender,Age,Christmas Period,Easter Period,Age Group,Day Type,Time of Day,Number Fatalities
0,NSW,12,2024,Friday,04:00,Single,No,No,No,100,Driver,Male,74,Yes,No,65_to_74,Weekday,Night,1.0
1,NSW,12,2024,Friday,06:15,Single,No,No,No,80,Driver,Female,19,No,No,17_to_25,Weekday,Day,1.0
2,Tas,12,2024,Friday,09:43,Multiple,No,No,No,50,Driver,Female,33,Yes,No,26_to_39,Weekday,Day,1.0
3,NSW,12,2024,Friday,10:35,Multiple,No,No,No,100,Driver,Female,32,No,No,26_to_39,Weekday,Day,1.0
4,Qld,12,2024,Friday,13:00,Multiple,No,No,No,100,Passenger,Female,61,No,No,40_to_64,Weekday,Day,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54934,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Male,13,No,No,0_to_16,Weekday,Night,6.0
54935,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Female,11,No,No,0_to_16,Weekday,Night,6.0
54936,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Passenger,Female,13,No,No,0_to_16,Weekday,Night,6.0
54937,Tas,1,1989,Wednesday,20:20,Multiple,No,No,Yes,100,Driver,Male,18,No,No,17_to_25,Weekday,Night,6.0


In [5]:
# List the column labels of the current frame
df.columns

Index(['State', 'Month', 'Year', 'Day', 'Time', 'Crash Type',
       'Bus Involvement', 'Heavy Rigid Truck Involvement',
       'Articulated Truck Involvement', 'Speed Limit', 'Road User', 'Gender',
       'Age', 'Christmas Period', 'Easter Period', 'Age Group', 'Day Type',
       'Time of Day', 'Number Fatalities'],
      dtype='object')

In [6]:
# Turn every row into a transaction of "<column>=<value>" items.
# Values are cast to text first (TransactionEncoder is fed string items here),
# and the column-name prefix keeps equal values from different columns apart.
df_str = df.astype(str).apply(lambda col: (col.name + '=') + col)

# One list of items per row of the frame
df_list = df_str.to_numpy().tolist()

# Learn the item vocabulary, then one-hot encode each transaction
te = TransactionEncoder()
te.fit(df_list)
array_te = te.transform(df_list)

# Wrap the encoded array in a frame whose columns are the learned item labels
transformed_df = pd.DataFrame(data=array_te, columns=te.columns_)
transformed_df.head()

Unnamed: 0,Age Group=0_to_16,Age Group=17_to_25,Age Group=26_to_39,Age Group=40_to_64,Age Group=65_to_74,Age Group=75_or_older,Age=0,Age=1,Age=10,Age=100,...,Year=2015,Year=2016,Year=2017,Year=2018,Year=2019,Year=2020,Year=2021,Year=2022,Year=2023,Year=2024
0,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [7]:
# Mine the itemsets whose support is at least 0.2, labelled by column name
frequent_itemsets = apriori(
    transformed_df,
    min_support=0.2,
    use_colnames=True,
)

# Record how many items each frequent itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.255265,(Age Group=17_to_25),1
1,0.232440,(Age Group=26_to_39),1
2,0.259269,(Age Group=40_to_64),1
3,0.899033,(Articulated Truck Involvement=No),1
4,0.981889,(Bus Involvement=No),1
...,...,...,...
2164,0.258942,"(Heavy Rigid Truck Involvement=No, Bus Involve...",8
2165,0.228253,"(Heavy Rigid Truck Involvement=No, Bus Involve...",8
2166,0.227780,"(Heavy Rigid Truck Involvement=No, Bus Involve...",8
2167,0.263274,"(Heavy Rigid Truck Involvement=No, Bus Involve...",8


In [8]:
# Derive association rules from the frequent itemsets, using confidence as the
# selection metric with a minimum threshold of 0.1
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.1,
)

# Keep only rules whose consequent holds at least one "Road User=..." item
road_user_rules = rules.loc[
    rules['consequents'].map(
        lambda items: any('Road User=' in item for item in items)
    )
]

# Show the ten strongest rules: highest lift first, then highest confidence
road_user_rules.sort_values(
    by=['lift', 'confidence'],
    ascending=[False, False],
).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1512,"(Speed Limit=100, Bus Involvement=No)",(Road User=Driver),0.346421,0.456051,0.200022,0.577396,1.266077,1.0,0.042036,1.287136,0.32155,0.332014,0.223081,0.507996
1515,(Speed Limit=100),"(Bus Involvement=No, Road User=Driver)",0.35108,0.451119,0.200022,0.569732,1.262933,1.0,0.041643,1.275675,0.320829,0.332165,0.216101,0.506562
2436,"(Speed Limit=100, Easter Period=No)",(Road User=Driver),0.34886,0.456051,0.200386,0.574403,1.259513,1.0,0.041288,1.278083,0.316433,0.331477,0.217578,0.506898
276,(Speed Limit=100),(Road User=Driver),0.35108,0.456051,0.201478,0.57388,1.258368,1.0,0.041367,1.276516,0.316402,0.332662,0.216618,0.507834
2439,(Speed Limit=100),"(Easter Period=No, Road User=Driver)",0.35108,0.45374,0.200386,0.570769,1.257923,1.0,0.041087,1.27265,0.315969,0.331526,0.214238,0.506201
55881,"(Heavy Rigid Truck Involvement=No, Bus Involve...","(Crash Type=Single, Road User=Driver)",0.711134,0.238282,0.202297,0.284471,1.19384,1.0,0.032846,1.064552,0.562083,0.270769,0.060637,0.566726
35057,"(Number Fatalities=1.0, Heavy Rigid Truck Invo...","(Crash Type=Single, Road User=Driver)",0.715557,0.238282,0.203498,0.284392,1.193506,1.0,0.032994,1.064433,0.57,0.271208,0.060533,0.569207
55903,"(Number Fatalities=1.0, Heavy Rigid Truck Invo...","(Crash Type=Single, Easter Period=No, Road Use...",0.715557,0.236936,0.202297,0.282713,1.193205,1.0,0.032756,1.06382,0.569257,0.269659,0.059991,0.56826
40888,"(Number Fatalities=1.0, Heavy Rigid Truck Invo...","(Crash Type=Single, Road User=Driver)",0.723967,0.238282,0.202534,0.279756,1.17405,1.0,0.030025,1.057582,0.537065,0.266592,0.054447,0.564864
55912,"(Number Fatalities=1.0, Heavy Rigid Truck Invo...","(Crash Type=Single, Bus Involvement=No, Road U...",0.723967,0.238009,0.202297,0.279429,1.174024,1.0,0.029986,1.057481,0.536995,0.266293,0.054357,0.564691
