分析改用特定道路組合下為有設施的機率

In [1]:
import os
import numpy as np
import pandas as pd
import bnlearn as bn
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = ['Arial Unicode Ms']

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

from utils_behaviour import get_model, draw_bn_plotly, cpd_add_n, filter_cpd_for_hotspot, get_outlier
from config import category_value_map, feature_name_map, cause_mapping
from utils import read_data
from utils_eda import speed_bin

version = "V2"
computeddata = 'ComputedDataV3'

select_group_behaviour = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態子類別名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'youbike_100m_count',
    'cause_group']

In [2]:
combined_data = read_data()
# taiwan, grid_filter = read_taiwan_specific(read_grid=True)

def map_cause(cause):
    for category, causes in cause_mapping.items():
        if cause in causes:
            return category
    return "Unknown"

combined_data["cause_group"] = combined_data["肇因研判子類別名稱-主要"].apply(map_cause)

data = combined_data[select_group_behaviour].copy()
data['facility'] = data[['youbike_100m_count']].apply(lambda row: '1' if (row > 0).any() else '0', axis=1)
data.drop(columns=['youbike_100m_count'], inplace=True)

data['速限-第1當事者'] = speed_bin(data)

### Bayesian Network setting

In [4]:
parent = ['速限-第1當事者', '道路類別-第1當事者-名稱', '道路型態子類別名稱', 'facility', '號誌-號誌種類名稱']
child = 'cause_group'
result = ['事故類型及型態子類別名稱']

white_list = [
    ('速限-第1當事者', 'cause_group'),
    ('道路類別-第1當事者-名稱', 'cause_group'),
    ('道路型態子類別名稱', 'cause_group'),
    ('facility', 'cause_group'),
    ('號誌-號誌種類名稱', 'cause_group'),
]

black_list = []
# cause -> parent
black_list += [(c, p) for c in [child] for p in parent]
# result -> parent/cause
black_list += [(r, x) for r in result for x in (parent + [child] + result)]

In [5]:
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)
# draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of 號誌-號誌種類名稱:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.591345  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.258126  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.101421  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0491083 |
+-----------------------------+-----------+
[bnlearn] >CPD of cause_group:
+--------------------------------+-----+-------------------+
| facility                       | ... | facility(1)       |
+-----------------

## 因果分析
- cpd=True的時候考慮的是由因導果，在特定環境下(parent)，是故事該肇因的機率有多高(child)
- cpd=False時考慮的是由果朔因，在這些肇因中，最常出現的特徵是什麼

1. 一方面要考慮高風險的區域不一定常見，所以cpd=True時需要用資料總量來篩選，另一方面常見組合並不一定高風險
2. 在考慮因到果時才需要篩選數量，因為他們才是真正值得關注的高風險區

In [38]:
draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

In [45]:
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

filtered_true = filtered[(filtered['n'] > 3000) & (filtered['p'] > 0.3) & (filtered['道路類別-第1當事者-名稱'] == '市區道路')]
filtered_true['total_n'] = (filtered_true.groupby(parent)['n'] .transform('sum'))
# check_sum must be around 1
# check_sum = (filtered_true.groupby(parent)['p'].sum())
# check_sum
filtered_true = filtered_true[['速限-第1當事者', '號誌-號誌種類名稱', '道路型態子類別名稱', 'facility', 'cause_group', 'p', 'n']]
filtered_true.sort_values(by=['速限-第1當事者', '號誌-號誌種類名稱', '道路型態子類別名稱', 'facility', 'cause_group', 'p', 'n'], ascending=False, inplace=True)

for col in filtered_true.columns:
    try:
        if col != '速限-第1當事者':
            filtered_true[col] = filtered_true[col].map(category_value_map[col])
    except KeyError:
        pass
filtered_true.rename(columns=feature_name_map, inplace=True)
filtered_true.to_excel(f'../{computeddata}/Behaviour/cpdtrue.xlsx', index=False)

In [None]:
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0, cpd=False)

filtered_false = filtered[(filtered['n'] > 300) & (filtered['p_diagnostic'] > 0.05)]
filtered_false['total_n'] = (filtered_false.groupby(parent)['n'] .transform('sum'))
filtered_false = filtered_false[['cause_group'] + parent + ['p_diagnostic', 'n', 'total_n']]
# filtered_n.sort_values(by=['cause_group'] + parent, ascending=False, inplace=False)
filtered_false.sort_values(by=['cause_group', '速限-第1當事者', '號誌-號誌種類名稱', '道路型態子類別名稱', 'facility', 'p_diagnostic'], ascending=False, inplace=True)

for col in filtered_false.columns:
    try:
        if col != '速限-第1當事者':
            filtered_false[col] = filtered_false[col].map(category_value_map[col])
    except KeyError:
        pass
filtered_false.rename(columns=feature_name_map, inplace=True)
filtered_false.to_excel(f'../{computeddata}/Behaviour/cpdfalse.xlsx', index=False)

- risk 對應到： $\frac{N(E \cap C)}{N(C)}$
- frequency 對應到： $N(E \cap C)$

In [None]:
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered_true = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)
filtered_false = cpd_add_n(parent, child, dfprob_cause, data, threshold=0, cpd=False)

df_risk = filtered_true.rename(columns={'p': 'p_risk', 'n': 'n_joint'})
df_diag = filtered_false.rename(columns={'p_diagnostic': 'p_freq'})

merge_keys = parent + [child]
df_diag_clean = df_diag[merge_keys + ['p_freq']]

df_diag_full = df_diag.rename(columns={'n': 'n_joint_diag'})
merged_df = pd.merge(df_risk, df_diag_full[merge_keys + ['p_freq', 'n_joint_diag']], on=merge_keys, how='outer')

merged_df['p_risk'] = merged_df['p_risk'].fillna(0)
merged_df['p_freq'] = merged_df['p_freq'].fillna(0)

def classify_danger(row):
    risk_high = row['p_risk'] > 0.05
    diag_high = row['p_freq'] > 0.05
    if risk_high and diag_high:
        return 'high risk, high frequency'
    elif risk_high and not diag_high:
        return 'high risk, low frequency'
    elif not risk_high and diag_high:
        return 'low risk, high frequency'
    else:
        return 'low risk, low frequency'

merged_df['Category'] = merged_df.apply(classify_danger, axis=1)
merged_df#.sort_values(by=['p_risk'], ascending=False).to_excel('Risk_Matrix.xlsx')

Unnamed: 0,cause_group,facility,號誌-號誌種類名稱,速限-第1當事者,道路型態子類別名稱,道路類別-第1當事者-名稱,p_risk,n_joint,n_joint.1,p_freq,n_joint_diag,Category
0,Decision,0,閃光號誌,30-39,四岔路,縣道,0.9985,42,42,0.0001,42,"high risk, low frequency"
1,Decision,1,行車管制號誌(附設行人專用號誌),50-59,三岔路,鄉道,0.9963,17,17,0.0000,17,"high risk, low frequency"
2,Decision,1,無號誌,40-49,四岔路,鄉道,0.9943,11,11,0.0000,11,"high risk, low frequency"
3,Decision,1,無號誌,30-39,四岔路,鄉道,0.9943,11,11,0.0000,11,"high risk, low frequency"
4,Decision,0,閃光號誌,60-69,多岔路,縣道,0.9937,10,10,0.0000,10,"high risk, low frequency"
...,...,...,...,...,...,...,...,...,...,...,...,...
110587,Other,1,無號誌,50-59,其他,市區道路,0.0000,0,0,0.0000,0,"low risk, low frequency"
110588,Other,1,無號誌,50-59,三岔路,村里道路,0.0000,0,0,0.0000,0,"low risk, low frequency"
110589,Vehicle,0,閃光號誌,40-49,三岔路,村里道路,0.0000,0,0,0.0000,0,"low risk, low frequency"
110590,Other,0,行車管制號誌(附設行人專用號誌),60-69,三岔路,市區道路,0.0000,0,0,0.0000,0,"low risk, low frequency"


In [102]:
lst = ['cause_group', 'facility', '號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'p_risk', 'p_freq', 'Category']
risk_matrix = merged_df[merged_df['Category'] == 'high risk, high frequency'][lst]
risk_matrix.sort_values(by=lst, ascending=False, inplace=True)
for col in risk_matrix.columns:
    try:
        if col != '速限-第1當事者':
            risk_matrix[col] = risk_matrix[col].map(category_value_map[col])
    except KeyError:
        pass

feature_name_map['cause_group'] = 'Cause Group'
risk_matrix.rename(columns=feature_name_map, inplace=True)
risk_matrix.to_excel('../ComputedDataV3/Behaviour/Risk_Matrix.xlsx', index=False)

### 去掉肇因重新分析
他的效果和BehaviourV2_hotspot相似，都是找出有設施的機率

In [None]:
parent = ['速限-第1當事者', '號誌-號誌種類名稱', '道路類別-第1當事者-名稱', '道路型態子類別名稱', 'cause_group']
child = 'facility'
result = ['事故類型及型態子類別名稱']

white_list = [
    ('速限-第1當事者', 'facility'),
    ('道路類別-第1當事者-名稱', 'facility'),
    ('道路型態子類別名稱', 'facility'),
    ('號誌-號誌種類名稱', 'facility'),
    ('cause_group', 'facility'),
]

black_list = []
# child -> parent
black_list += [(c, p) for c in child for p in parent]
# result -> parent/child
black_list += [(r, x) for r in result for x in (parent + [child] + result)]

model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of 號誌-號誌種類名稱:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.591345  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.258126  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.101421  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0491083 |
+-----------------------------+-----------+
[bnlearn] >CPD of facility:
+---------------+-----+----------------------+
| cause_group   | ... | cause_group(Vehicle) |
+---------------+-----+----------------------+
| 

In [None]:
draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

In [203]:
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0, cpd=False)
filtered_n = filtered[filtered['n'] > 1000]
filtered_n.sort_values(by=['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'facility', 'p_diagnostic'], ascending=False, inplace=True)
filtered_n['total_n'] = (filtered_n.groupby(parent)['n'] .transform('sum'))
filtered_n = filtered_n[parent + ['p_diagnostic', 'n', 'total_n']]
filtered_n.sort_values(by=['p_diagnostic'], ascending=False, inplace=True)
filtered_n

Unnamed: 0,速限-第1當事者,號誌-號誌種類名稱,道路類別-第1當事者-名稱,道路型態子類別名稱,cause_group,p_diagnostic,n,total_n
56424,50-59,無號誌,市區道路,直路,Decision,0.1148,14468,66917
58062,50-59,行車管制號誌,市區道路,四岔路,Decision,0.1134,14293,53744
1128,50-59,無號誌,市區道路,直路,Decision,0.0988,52449,66917
59790,50-59,行車管制號誌(附設行人專用號誌),市區道路,四岔路,Decision,0.0771,9716,27117
2766,50-59,行車管制號誌,市區道路,四岔路,Decision,0.0743,39451,53744
...,...,...,...,...,...,...,...,...
1074,50-59,無號誌,市區道路,多岔路,Decision,0.0021,1098,1098
2770,50-59,行車管制號誌,縣道,四岔路,Decision,0.0021,1109,1109
44328,50-59,行車管制號誌,市區道路,直路,Unidentified,0.0021,1114,1114
120,0-9,無號誌,市區道路,直路,Decision,0.0020,1047,1047


In [192]:
for col in filtered_n.columns:
    try:
        filtered_n[col] = filtered_n[col].map(category_value_map[col])
    except KeyError:
        pass
filtered_n.rename(columns=feature_name_map, inplace=False)

Unnamed: 0,Speed limit,Traffic signal type,Road category,Road type,cause_group,p_diagnostic,n,total_n
56424,50-59,No signal,Urban road,Straight road,Decision,0.1148,14468,66917
58062,50-59,Traffic control signal,Urban road,Crossroad,Decision,0.1134,14293,53744
1128,50-59,No signal,Urban road,Straight road,Decision,0.0988,52449,66917
59790,50-59,Traffic control signal (with pedestrian-only s...,Urban road,Crossroad,Decision,0.0771,9716,27117
2766,50-59,Traffic control signal,Urban road,Crossroad,Decision,0.0743,39451,53744
...,...,...,...,...,...,...,...,...
1074,50-59,No signal,Urban road,Multi-junction,Decision,0.0021,1098,1098
2770,50-59,Traffic control signal,County road,Crossroad,Decision,0.0021,1109,1109
44328,50-59,Traffic control signal,Urban road,Straight road,Unidentified,0.0021,1114,1114
120,0-9,No signal,Urban road,Straight road,Decision,0.0020,1047,1047
