分析改用特定道路組合下為有設施的機率

In [3]:
import os
import pandas as pd
import bnlearn as bn
import matplotlib.pyplot as plt
# Font used for all matplotlib text.
# NOTE(review): presumably chosen so the Chinese column/category labels render — confirm the font
# is installed on the machine running the notebook.
plt.rcParams['font.family'] = ['Arial Unicode Ms']

# Build the path of the "utils" directory that sits next to the current working directory
# (i.e. <parent of cwd>/utils).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Make utils/ the working directory before the project imports that follow.
# NOTE(review): this is a process-wide side effect — every relative path used later in the
# notebook is resolved against utils/, not against the notebook's own folder.
os.chdir(analyze_path)

from utils_behaviour import get_model, draw_bn_plotly, cpd_add_n, filter_cpd_for_hotspot, get_outlier
from config import category_value_map, feature_name_map, cause_mapping
from utils import read_data
from utils_eda import speed_bin

# Run identifiers; `computeddata` is the output folder name used by the (commented-out) exports.
version = "V2"
computeddata = "ComputedDataV3"

# Columns pulled from the combined dataset for the behaviour analysis.
select_group_behaviour = [
    # traffic signal: type / action
    "號誌-號誌種類名稱",
    "號誌-號誌動作名稱",
    # lane-division facilities: between fast/general lanes, between fast/slow lanes, edge line
    "車道劃分設施-分道設施-快車道或一般車道間名稱",
    "車道劃分設施-分道設施-快慢車道間名稱",
    "車道劃分設施-分道設施-路面邊線名稱",
    # accident type and pattern (subcategory)
    "事故類型及型態子類別名稱",
    # directional-division facility (subcategory)
    "車道劃分設施-分向設施子類別名稱",
    # road form (subcategory)
    "道路型態子類別名稱",
    # speed limit and road category of party 1
    "速限-第1當事者",
    "道路類別-第1當事者-名稱",
    # raw count later collapsed into the binary `facility` flag
    "youbike_100m_count",
    # derived cause group (added to the dataset before this list is applied)
    "cause_group",
]

In [4]:
# Load the dataset through the project helper, called with its defaults.
# The columns this notebook relies on are the ones listed in `select_group_behaviour`
# plus '肇因研判子類別名稱-主要' (used to derive `cause_group`).
combined_data = read_data()
# taiwan, grid_filter = read_taiwan_specific(read_grid=True)

def map_cause(cause):
    """Return the cause group that lists ``cause`` among its members.

    Groups are scanned in the iteration order of ``cause_mapping``; the first
    group whose member collection contains ``cause`` wins. When no group
    contains it, the string ``"Unknown"`` is returned.
    """
    matching_groups = (
        group for group, members in cause_mapping.items() if cause in members
    )
    return next(matching_groups, "Unknown")

# Tag every record with its cause group, derived from the primary-cause subcategory.
combined_data["cause_group"] = combined_data["肇因研判子類別名稱-主要"].apply(map_cause)

# Work on an independent copy restricted to the analysis columns.
data = combined_data[select_group_behaviour].copy()

# Binary facility flag stored as the strings '1' / '0': '1' when at least one YouBike
# station was counted within 100 m. A vectorised comparison replaces the former row-wise
# DataFrame.apply(axis=1), which ran a Python lambda per row for the same result.
# Missing counts (NaN) compare as False and therefore map to '0'.
data['facility'] = data['youbike_100m_count'].gt(0).map({True: '1', False: '0'})
data = data.drop(columns=['youbike_100m_count'])

# Replace the raw speed limit with the binned label produced by the project helper.
data['速限-第1當事者'] = speed_bin(data)

### Bayesian Network setting

In [6]:
# Node roles used to build the structure-learning constraints.
# `parent`: candidate explanatory nodes; `cause`: the node being explained;
# `result`: the outcome node, which must not point at anything.
parent = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'facility',
    'cause_group',
]
cause = ['cause_group']
result = ['事故類型及型態子類別名稱']

# Allowed edges: each listed road attribute -> cause_group.
white_list = [
    (road_attr, 'cause_group')
    for road_attr in (
        '速限-第1當事者',
        '道路類別-第1當事者-名稱',
        '道路型態子類別名稱',
        'facility',
        '號誌-號誌種類名稱',
    )
]

# Forbidden edges, in two groups (order and duplicates kept as-is):
#   1. cause  -> any parent node
#   2. result -> any parent / cause / result node
black_list = [
    *((c, p) for c in cause for p in parent),
    *((r, x) for r in result for x in [*parent, *cause, *result]),
]

In [7]:
# Fit the Bayesian network on `data` under the black/white edge lists defined in the previous cell.
# The helper returns three objects, unpacked here; only `model_param_all` is used later (CPD extraction).
# Per the captured log, the run uses hill-climbing search with BIC scoring and Bayesian
# parameter estimation.
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)
# draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of 號誌-號誌種類名稱:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.591345  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.258126  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.101421  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0491083 |
+-----------------------------+-----------+
[bnlearn] >CPD of cause_group:
+--------------------------------+-----+-------------------+
| facility                       | ... | facility(1)       |
+-----------------

In [27]:
# Target node whose conditional distribution is inspected, and the conditioning columns.
# NOTE: this rebinds `parent` from the 11-node constraint list to the 5 conditioning columns;
# the cells below depend on this shorter list.
child = 'cause_group'
parent = [
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    '道路型態子類別名稱',
    'facility',
    '號誌-號誌種類名稱',
]
# Cause groups ordered from most to least frequent in the data.
evidence_v = data[child].value_counts().index.tolist()

In [28]:
# Collect the CPDs of the fitted network, keyed by node name (the call also echoes each
# table to the cell output, which is the long log captured under this cell).
CPDs = bn.print_CPD(model_param_all)
# CPD of the child node selected above.
dfprob_cause = CPDs[child]
# NOTE(review): cpd_add_n is a project helper; downstream cells read columns 'p' and 'n' from
# its result, so it presumably attaches the observed row count `n` from `data` to each CPD row
# — confirm. threshold=0 is passed through to the helper unchanged.
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

[bnlearn] >[CPD] >[Node 號誌-號誌種類名稱]:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.591345  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.258126  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.101421  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0491083 |
+-----------------------------+-----------+
[bnlearn] >[CPD] >[Node cause_group]:
+--------------------------------+-----+-------------------+
| facility                       | ... | facility(1)       |
+--------------------------------+-----+-------------------+
| 號誌-號誌種類名稱                      | ... | 號誌-號誌種類名稱(閃光號誌)   |
+--------------------------------+-----+-------------------+
| 速限-第1當事者                       | ... | 速限-第1當事者(90-99)   |
+--------------------------------+-----+-------------------+
| 道路型態子類別名稱                      | ... | 道路型態子類別名稱(高架道路)   |
+--------------------------------+-----+--

In [11]:
# Keep only the parent/cause combinations that were actually observed (n > 0).
# The explicit .copy() and the non-inplace sorts make `filtered_n` an independent frame:
# the previous version sorted and added a column in place on a boolean-filtered slice of
# `filtered`, which is chained assignment and triggers pandas' SettingWithCopyWarning.
filtered_n = (
    filtered[filtered['n'] > 0]
    .copy()
    # Pre-sort kept from the original; it determines the order of ties in the final sort.
    .sort_values(
        by=['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'facility', 'p'],
        ascending=False,
    )
)

# Number of observations per parent combination, repeated on each of its cause rows.
filtered_n['total_n'] = filtered_n.groupby(parent)['n'].transform('sum')

# Largest combinations first; within a combination, most probable cause first.
filtered_n = filtered_n.sort_values(by=['total_n', 'p'], ascending=False)

# check_sum must be around 1
# check_sum = (filtered_n.groupby(parent)['p'].sum())
# check_sum

In [12]:
# Put the conditioning columns first, followed by the cause group and its statistics.
filtered_n = filtered_n[parent + ['cause_group', 'p', 'n', 'total_n']]
# The frame is displayed in the order set by the previous cell (total_n, then p, descending).
# A `sort_values(..., inplace=False)` call used to sit here, but its return value was never
# assigned, so it only cost time and changed nothing; it has been removed.
# To order by the conditioning columns instead, assign the result explicitly:
#   filtered_n = filtered_n.sort_values(by=parent + ['cause_group'], ascending=False)
filtered_n

Unnamed: 0,速限-第1當事者,道路類別-第1當事者-名稱,道路型態子類別名稱,facility,號誌-號誌種類名稱,cause_group,p,n,total_n
1128,50-59,市區道路,直路,0,無號誌,Decision,0.6431,52449,81555
14952,50-59,市區道路,直路,0,無號誌,Distraction,0.2112,17225,81555
84072,50-59,市區道路,直路,0,無號誌,Unidentified,0.0782,6378,81555
70248,50-59,市區道路,直路,0,無號誌,Posture,0.0260,2121,81555
28776,50-59,市區道路,直路,0,無號誌,Driver Impairment,0.0232,1895,81555
...,...,...,...,...,...,...,...,...,...
6939,0-9,其他,四岔路,1,無號誌,Decision,0.9410,1,1
48409,0-9,縣道,其他,1,無號誌,Environmental,0.9410,1,1
25,0-9,縣道,其他,0,無號誌,Decision,0.9410,1,1
26,0-9,鄉道,其他,0,無號誌,Decision,0.9410,1,1


### 找出「有設施」與「無設施」在各種道路組合下的差異

In [20]:
import numpy as np
# Columns that define one "road combination".
design = ['道路類別-第1當事者-名稱', '號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱']
df = filtered_n.copy()
# facility is stored as the strings '0' / '1'; cast so the pivoted column labels are ints.
df['facility'] = df['facility'].astype(int)

# Sum n per (road combination, facility) and pivot facility into two columns.
g = (
    df.groupby(design + ['facility'])['n']
    .sum()
    .unstack('facility', fill_value=0)
    # Guarantee both classes are present: if the data holds only facility=0 (or only 1),
    # unstack yields a single column and the later 'facility_0' / 'facility_1' lookups
    # would raise KeyError.
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns={0: 'facility_0', 1: 'facility_1'})
)

# Empirical share of facility=1 within each road combination:
# facility_1 / (facility_0 + facility_1); a zero denominator becomes NaN instead of inf.
g['total_n'] = g['facility_0'] + g['facility_1']
g['Facility_P'] = g['facility_1'] / g['total_n'].replace(0, np.nan)

g = g.reset_index()
# Keep combinations with data and a facility share above 10 %, highest share first.
# Non-inplace sort: sorting a boolean-filtered frame in place is chained assignment.
g = (
    g[(g['total_n'] > 0) & (g['Facility_P'] > 0.1)]
    .sort_values(by='Facility_P', ascending=False)
)
# Project helper applied to the ranked table; its filtering rules live in utils_behaviour.
g_hotspot = filter_cpd_for_hotspot(g)

In [21]:
# Show the hotspot table: one row per road combination with its facility_0 / facility_1
# counts, total_n and Facility_P.
g_hotspot

facility,道路類別-第1當事者-名稱,號誌-號誌種類名稱,速限-第1當事者,道路型態子類別名稱,facility_0,facility_1,total_n,Facility_P
592,市區道路,行車管制號誌,10-19,橋樑,0,1,1,1.000000
599,市區道路,行車管制號誌,20-29,坡路,0,1,1,1.000000
535,市區道路,無號誌,50-59,廣場,0,1,1,1.000000
744,市區道路,閃光號誌,20-29,圓環,0,1,1,1.000000
598,市區道路,行車管制號誌,20-29,圓環,0,1,1,1.000000
...,...,...,...,...,...,...,...,...
733,市區道路,閃光號誌,0-9,四岔路,47,6,53,0.113208
611,市區道路,行車管制號誌,30-39,橋樑,8,1,9,0.111111
518,市區道路,無號誌,40-49,坡路,214,25,239,0.104603
536,市區道路,無號誌,50-59,彎曲路及附近,4208,483,4691,0.102963


In [22]:
# Reduce the hotspot table to the reporting columns and order it for reading.
# .copy() plus non-inplace calls make this an independent frame; the previous version sorted
# and wrote columns in place on a column slice, which is chained assignment and triggers
# pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `g_hotspot` with renamed columns, so it cannot be re-run on its own;
# re-run the cell that creates `g_hotspot` first.
g_hotspot = (
    g_hotspot[['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'Facility_P', 'total_n']]
    .copy()
    .sort_values(by=['號誌-號誌種類名稱', '速限-第1當事者', 'Facility_P'], ascending=False)
)
g_hotspot['Facility_P'] = g_hotspot['Facility_P'].round(3)
# g_hotspot = g_hotspot[g_hotspot['total_n']>100]

# Translate category values through the project lookup tables; a value missing from a
# table becomes NaN (Series.map semantics).
for col in ['號誌-號誌種類名稱', '道路型態子類別名稱']:
    g_hotspot[col] = g_hotspot[col].map(category_value_map[col])
# Rename the columns through the project's feature-name lookup.
g_hotspot = g_hotspot.rename(columns=feature_name_map)

# g_hotspot.to_excel(f'../{computeddata}/Behaviour/Behaviour_hotspot.xlsx', index=False)

In [23]:
# Roundabout combinations backed by more than 10 observations, highest facility share first.
(
    g_hotspot
    .loc[lambda t: (t['Road type (subcategory)'] == 'Roundabout') & (t['total_n'] > 10)]
    .sort_values(by='Facility_P', ascending=False)
)

facility,Traffic signal type,Speed limit (Party 1),Road type (subcategory),Facility_P,total_n
757,Flashing signal,40-49,Roundabout,1.0,20
531,No signal,50-59,Roundabout,0.462,532
706,Traffic control signal (with pedestrian-only s...,50-59,Roundabout,0.4,50
618,Traffic control signal,40-49,Roundabout,0.333,12
501,No signal,30-39,Roundabout,0.322,59
630,Traffic control signal,50-59,Roundabout,0.266,94
516,No signal,40-49,Roundabout,0.135,192


### 去掉肇因重新分析
此分析的效果和 BehaviourV2_hotspot 相似，都是找出有設施的機率

In [None]:
# Edge constraints for a second network in which `facility` is the node being explained.
# NOTE(review): 'COUNTYNAME' is not among the columns selected into `data` earlier in this
# notebook, and `data` still contains 'cause_group' although this section is titled as the
# analysis without the cause — confirm both are intended.
# NOTE: this cell overwrites model_all / model_param_all / model_independence_all from the
# cause_group model.
parent = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'facility',
    'COUNTYNAME',
]
cause = ['facility']
result = ['事故類型及型態子類別名稱']

# Allowed edges: each listed road attribute -> facility.
white_list = [
    (road_attr, 'facility')
    for road_attr in (
        '速限-第1當事者',
        '道路類別-第1當事者-名稱',
        '道路型態子類別名稱',
        '號誌-號誌種類名稱',
    )
]

# Forbidden edges, in two groups (order and duplicates kept as-is):
#   1. cause  -> any parent node
#   2. result -> any parent / cause / result node
black_list = [
    *((c, p) for c in cause for p in parent),
    *((r, x) for r in result for x in [*parent, *cause, *result]),
]

model_all, model_param_all, model_independence_all = get_model(
    data,
    black_list=black_list,
    white_list=white_list,
)

In [None]:
# Target node and conditioning columns for the facility CPD.
# NOTE: rebinds `parent`, `CPDs`, `dfprob_cause` and `filtered` from the cause_group analysis.
child = 'facility'
parent = [
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    '道路型態子類別名稱',
    '號誌-號誌種類名稱',
]
# Up to five most frequent facility values, most frequent first.
evidence_v = data[child].value_counts().index[:5].tolist()

# CPD of the child node from the fitted network, passed through the project helper
# (same call pattern as the cause_group analysis).
CPDs = bn.print_CPD(model_param_all)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

# Project hotspot filter, then keep rows with p above 0.1 and n above 100.
hotspot = filter_cpd_for_hotspot(filtered)
hotspot = hotspot.loc[hotspot['p'].gt(0.1) & hotspot['n'].gt(100)]

In [None]:
# Split the hotspot rows by facility flag, each ranked by probability (highest first).
filtered_0 = hotspot.loc[hotspot['facility'] == '0'].sort_values(by=['p'], ascending=False)
filtered_1 = hotspot.loc[hotspot['facility'] == '1'].sort_values(by=['p'], ascending=False)

# Stack both halves and order them for reading: facility, signal type, speed limit, then p,
# all descending.
full_filter = pd.concat([filtered_0, filtered_1], axis=0).sort_values(
    by=['facility', '號誌-號誌種類名稱', '速限-第1當事者', 'p'],
    ascending=False,
)
full_filter
# for col in ['道路類別-第1當事者-名稱', '道路型態子類別名稱', '號誌-號誌種類名稱']:
#     full_filter[col] = full_filter[col].map(category_value_map[col])
# full_filter.rename(columns=feature_name_map, inplace=True)
# full_filter.to_excel(f'../{computeddata}/Behaviour/Origin_outlierV2.xlsx', index=False)