分析改用特定道路組合下為有設施的機率

In [3]:
import os
import pandas as pd
import bnlearn as bn
import matplotlib.pyplot as plt
# Select the matplotlib font family; presumably chosen so the Chinese column
# names used throughout this notebook render in figures — TODO confirm.
plt.rcParams['font.family'] = ['Arial Unicode Ms']

# Resolve the sibling "utils" directory (../utils relative to the current
# working directory) and make it the working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# NOTE(review): the project-local imports that follow (utils_behaviour, config,
# utils, utils_eda) appear to rely on this chdir to be importable, so it must
# run before them.
os.chdir(analyze_path)

from utils_behaviour import get_model, draw_bn_plotly, cpd_add_n, filter_cpd_for_hotspot, get_outlier
from config import category_value_map, feature_name_map, cause_mapping
from utils import read_data
from utils_eda import speed_bin

# Run identifiers. `computeddata` is the folder name used by the (commented-out)
# Excel export paths further down in this notebook.
version = "V2"
computeddata = "ComputedDataV3"

# Columns selected from the combined accident table for the behaviour analysis.
# `youbike_100m_count` is later collapsed into a binary `facility` flag and
# `cause_group` is derived from the primary-cause column before selection.
select_group_behaviour = [
    "號誌-號誌種類名稱",
    "號誌-號誌動作名稱",
    "車道劃分設施-分道設施-快車道或一般車道間名稱",
    "車道劃分設施-分道設施-快慢車道間名稱",
    "車道劃分設施-分道設施-路面邊線名稱",
    "事故類型及型態子類別名稱",
    "車道劃分設施-分向設施子類別名稱",
    "道路型態子類別名稱",
    "速限-第1當事者",
    "道路類別-第1當事者-名稱",
    "youbike_100m_count",
    "cause_group",
]

In [4]:
# Load the combined accident table through the project helper `read_data`
# (imported from `utils`; its schema is not visible in this notebook).
combined_data = read_data()
# taiwan, grid_filter = read_taiwan_specific(read_grid=True)

def map_cause(cause):
    """Return the cause group that lists ``cause``, or ``"Unknown"``.

    ``cause_mapping`` (imported from ``config``) maps each group name to a
    container of detailed cause labels. Groups are scanned in the mapping's
    iteration order and the first group containing ``cause`` wins.
    """
    matching_groups = (
        group for group, members in cause_mapping.items() if cause in members
    )
    return next(matching_groups, "Unknown")

# Collapse each detailed primary cause into its coarse cause group.
combined_data["cause_group"] = combined_data["肇因研判子類別名稱-主要"].apply(map_cause)

# Work on a copy so the derived columns below never touch `combined_data`.
data = combined_data[select_group_behaviour].copy()

# Binary facility flag as the strings '1' / '0': '1' when the YouBike count is
# positive. A vectorised comparison replaces the former row-wise
# `apply(axis=1)` over a single-column frame (same result, far cheaper on a
# large table); missing counts compare as False and therefore map to '0'.
data['facility'] = data['youbike_100m_count'].gt(0).map({True: '1', False: '0'})
data = data.drop(columns=['youbike_100m_count'])

# Replace the raw speed limit with the output of the project helper
# `speed_bin`; presumably range labels such as '50-59', as seen in the result
# tables of this notebook — TODO confirm against utils_eda.
data['速限-第1當事者'] = speed_bin(data)

### Bayesian Network setting

In [6]:
# Node roles used to build the structure-learning constraints.
# `parent` lists the environment variables; note that it also contains
# 'cause_group', which is the `cause` node itself.
parent = [
    '號誌-號誌種類名稱', '號誌-號誌動作名稱','車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱', '車道劃分設施-分向設施子類別名稱', '道路型態子類別名稱',
    '速限-第1當事者', '道路類別-第1當事者-名稱', 'facility', 'cause_group'
    ]
cause = ['cause_group']
result = ['事故類型及型態子類別名稱']

# Edges that are allowed: selected environment variables -> cause_group.
white_list = [
    ('速限-第1當事者', 'cause_group'),
    ('道路類別-第1當事者-名稱', 'cause_group'),
    ('道路型態子類別名稱', 'cause_group'),
    ('facility', 'cause_group'),
    ('號誌-號誌種類名稱', 'cause_group'),
]

# Forbidden edge directions. Self-pairs are skipped and targets are
# de-duplicated: because 'cause_group' is in both `parent` and `cause`, the
# plain cross products used to emit ('cause_group', 'cause_group'), a
# result -> result self-loop and a duplicated result -> cause_group pair.
black_list = []
# cause -> parent (a cause must not point back at the environment)
black_list += [(c, p) for c in cause for p in parent if p != c]
# result -> every other node (the accident type is a pure sink)
black_list += [
    (r, x)
    for r in result
    for x in dict.fromkeys(parent + cause + result)
    if x != r
]

In [7]:
# Fit the network on `data` under the edge constraints defined above, using the
# project helper `get_model`. It returns three objects; later cells only read
# `model_param_all` (passed to `bn.print_CPD`). The log captured below shows
# hill-climbing structure search with BIC scoring and Bayesian parameter
# estimation.
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)
# draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of 號誌-號誌種類名稱:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.591345  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.258126  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.101421  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0491083 |
+-----------------------------+-----------+
[bnlearn] >CPD of cause_group:
+--------------------------------+-----+-------------------+
| facility                       | ... | facility(1)       |
+-----------------

In [None]:
# Conditioning variables and target node for the CPD lookups below. These are
# the source nodes of the white-listed edges into 'cause_group'.
# NOTE: this rebinds `parent`, which previously held the full environment list
# used to build the black list.
parent = ['速限-第1當事者', '道路類別-第1當事者-名稱', '道路型態子類別名稱', 'facility', '號誌-號誌種類名稱']
child = 'cause_group'

## 因果分析
- cpd=True 的時候考慮的是由因導果：在特定環境下（parent），事故屬於該肇因（child）的機率有多高
- cpd=False 時考慮的是由果溯因：在這些肇因中，最常出現的環境特徵是什麼

1. 一方面要考慮高風險的區域不一定常見，所以cpd=True時需要用資料總量來篩選，另一方面常見組合並不一定高風險
2. 在考慮因到果時才需要篩選數量，因為他們才是真正值得關注的高風險區

In [48]:
def cpd_add_n(parent, child, model, data, cpd=True, threshold=50):
    """Attach observed sample counts to a CPD table and filter/sort it.

    NOTE(review): a function with this name is also imported from
    ``utils_behaviour`` at the top of the notebook; this definition shadows it.

    Parameters
    ----------
    parent : sequence of str
        Conditioning column names. Any sequence is accepted; it is not mutated.
    child : str
        Name of the child column.
    model : pandas.DataFrame
        CPD table with one row per (parent..., child) state combination. A
        probability column ``p`` is expected if rounding/sorting by it is
        wanted.
    data : pandas.DataFrame
        Observations containing the ``parent`` and ``child`` columns.
    cpd : bool, default True
        True  -> forward view: keep ``p`` and sort by it.
        False -> diagnostic view: add
        ``p_diagnostic = N(parent, child) / N(child)``, rename ``p`` to
        ``p_model_forward`` and sort by ``p_diagnostic``.
    threshold : int, default 50
        Keep only rows whose joint count ``n`` is at least this value.

    Returns
    -------
    pandas.DataFrame
        A new frame (inputs are untouched) with integer ``n``, probabilities
        rounded to 4 decimals, sorted in descending order of the mode's
        probability column.
    """
    group_cols = [*parent, child]

    # N(parent, child): occurrences of every observed state combination.
    joint_counts = (
        data.groupby(group_cols, dropna=False)
        .size()
        .reset_index(name='n_joint')
    )
    # Left merge keeps every CPD row; combinations never observed get n = 0.
    df = model.merge(joint_counts, on=group_cols, how='left')
    df['n_joint'] = df['n_joint'].fillna(0)

    if cpd:
        df['n'] = df['n_joint']
        sort_col = 'p'
    else:
        # N(child): denominator of the empirical P(parent | child).
        child_totals = data.groupby(child).size().reset_index(name='n_child_total')
        df = df.merge(child_totals, on=child, how='left')
        df['n'] = df['n_joint']
        df['p_diagnostic'] = df['n_joint'] / df['n_child_total']
        df = df.rename(columns={'p': 'p_model_forward'})
        # Sort the diagnostic view by the diagnostic probability. Previously
        # the renamed forward column always overrode this choice, leaving the
        # 'p_diagnostic' branch unreachable.
        sort_col = 'p_diagnostic'

    filtered = df[df['n'] >= threshold].copy()

    for col in ('p', 'p_model_forward', 'p_diagnostic'):
        if col in filtered.columns:
            filtered[col] = filtered[col].round(4)

    filtered['n'] = filtered['n'].astype(int)

    if sort_col in filtered.columns:
        filtered = filtered.sort_values(sort_col, ascending=False)

    return filtered

In [59]:
# Forward view: P(cause_group | road environment) from the fitted model, with
# the observed count `n` of every (environment, cause) combination attached.
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

# Keep well-supported combinations only (n > 1000). The chain builds a fresh
# frame, so nothing is sorted or assigned in place on a filtered slice.
# `total_n` sums `n` over the rows that survive the filter, i.e. causes with
# n <= 1000 are not part of it.
# Optional sanity check: `filtered.groupby(parent)['p'].sum()` should be close
# to 1 for every environment before this filter is applied.
filtered_n = (
    filtered[filtered['n'] > 1000]
    .sort_values(
        by=['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'facility', 'p'],
        ascending=False,
    )
    .assign(total_n=lambda d: d.groupby(parent)['n'].transform('sum'))
    .loc[:, parent + ['cause_group', 'p', 'n', 'total_n']]
)
# The former trailing `sort_values(..., inplace=False)` discarded its result and
# never changed the table, so it was removed; the order above is what is shown.
filtered_n

Unnamed: 0,速限-第1當事者,道路類別-第1當事者-名稱,道路型態子類別名稱,facility,號誌-號誌種類名稱,cause_group,p,n,total_n
13134,50-59,市區道路,四岔路,1,閃光號誌,Decision,0.8877,1028,1028
6222,50-59,市區道路,四岔路,0,閃光號誌,Decision,0.9011,5428,5428
6224,50-59,村里道路,四岔路,0,閃光號誌,Decision,0.8765,1363,1363
6195,50-59,市區道路,三岔路,0,閃光號誌,Decision,0.8244,3898,3898
5934,30-39,市區道路,四岔路,0,閃光號誌,Decision,0.9494,2065,2065
...,...,...,...,...,...,...,...,...,...
723,30-39,市區道路,三岔路,0,無號誌,Decision,0.8554,11417,12562
725,30-39,村里道路,三岔路,0,無號誌,Decision,0.7923,2629,2629
14547,30-39,市區道路,三岔路,0,無號誌,Distraction,0.0858,1145,12562
550,110-119,國道,直路,0,無號誌,Decision,0.7001,1571,1571


In [64]:
# Diagnostic view: for each cause group, the empirical share of every road
# environment, p_diagnostic = N(environment, cause) / N(cause).
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0, cpd=False)

# Keep well-supported combinations (n > 1000) and rank them by p_diagnostic.
# The chain builds a fresh frame instead of sorting/assigning in place on a
# filtered slice. Two earlier sorts were dropped: one was immediately
# superseded by the final sort, the other (`inplace=False`) discarded its
# result. `total_n` sums `n` over the surviving rows of each environment.
filtered_n = (
    filtered[filtered['n'] > 1000]
    .assign(total_n=lambda d: d.groupby(parent)['n'].transform('sum'))
    .loc[:, ['cause_group'] + parent + ['p_diagnostic', 'n', 'total_n']]
    .sort_values(by=['p_diagnostic'], ascending=False)
)
filtered_n

Unnamed: 0,cause_group,速限-第1當事者,道路類別-第1當事者-名稱,道路型態子類別名稱,facility,號誌-號誌種類名稱,p_diagnostic,n,total_n
70248,Posture,50-59,市區道路,直路,0,無號誌,0.2960,2121,80068
14952,Distraction,50-59,市區道路,直路,0,無號誌,0.1971,17225,80068
28776,Driver Impairment,50-59,市區道路,直路,0,無號誌,0.1654,1895,80068
84072,Unidentified,50-59,市區道路,直路,0,無號誌,0.1246,6378,80068
1128,Decision,50-59,市區道路,直路,0,無號誌,0.1070,52449,80068
...,...,...,...,...,...,...,...,...,...
11496,Decision,50-59,市區道路,直路,1,行車管制號誌(附設行人專用號誌),0.0023,1115,1115
1074,Decision,50-59,市區道路,多岔路,0,無號誌,0.0022,1098,1098
13134,Decision,50-59,市區道路,四岔路,1,閃光號誌,0.0021,1028,1028
948,Decision,40-49,市區道路,彎曲路及附近,0,無號誌,0.0021,1034,1034


### 找出「有設施」與「無設施」在各種道路組合下的差異

In [20]:
import numpy as np
design = ['道路類別-第1當事者-名稱', '號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱']

# Start from the unfiltered count table (`filtered`, built with threshold=0),
# not from `filtered_n`: the latter only holds rows with n > 1000, which would
# distort the shares and cannot yield the small totals (e.g. total_n = 1) shown
# in the output of this section on a top-to-bottom run.
df = filtered.copy()
df['facility'] = df['facility'].astype(int)

# Sum n per (road design, facility) and pivot facility into two columns.
# `reindex` guarantees both columns exist even when one facility level is
# absent, which would otherwise raise a KeyError two lines below.
g = (df.groupby(design + ['facility'])['n']
       .sum()
       .unstack('facility', fill_value=0)
       .reindex(columns=[0, 1], fill_value=0)
       .rename(columns={0: 'facility_0', 1: 'facility_1'})
    )

# P(facility=1 | road design); a zero total becomes NaN instead of dividing by 0.
g['total_n'] = g['facility_0'] + g['facility_1']
g['Facility_P'] = g['facility_1'] / (g['total_n']).replace(0, np.nan)

# Keep observed designs whose facility share exceeds 10 %, highest share first.
g = g.reset_index()
g = (
    g[(g['total_n'] > 0) & (g['Facility_P'] > 0.1)]
    .sort_values(by='Facility_P', ascending=False)
)
g_hotspot = filter_cpd_for_hotspot(g)

In [21]:
# Display the facility-share table returned by `filter_cpd_for_hotspot`.
g_hotspot

facility,道路類別-第1當事者-名稱,號誌-號誌種類名稱,速限-第1當事者,道路型態子類別名稱,facility_0,facility_1,total_n,Facility_P
592,市區道路,行車管制號誌,10-19,橋樑,0,1,1,1.000000
599,市區道路,行車管制號誌,20-29,坡路,0,1,1,1.000000
535,市區道路,無號誌,50-59,廣場,0,1,1,1.000000
744,市區道路,閃光號誌,20-29,圓環,0,1,1,1.000000
598,市區道路,行車管制號誌,20-29,圓環,0,1,1,1.000000
...,...,...,...,...,...,...,...,...
733,市區道路,閃光號誌,0-9,四岔路,47,6,53,0.113208
611,市區道路,行車管制號誌,30-39,橋樑,8,1,9,0.111111
518,市區道路,無號誌,40-49,坡路,214,25,239,0.104603
536,市區道路,無號誌,50-59,彎曲路及附近,4208,483,4691,0.102963


In [22]:
# Build the presentation table as a new frame: select columns, order rows and
# round the share. The previous version sorted and assigned in place on a
# column-subset slice (chained assignment).
# NOTE: this cell rebinds `g_hotspot` with translated values and renamed
# columns, so it is meant to run once after the cell that creates `g_hotspot`.
g_hotspot = (
    g_hotspot[['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'Facility_P', 'total_n']]
    .sort_values(by=['號誌-號誌種類名稱', '速限-第1當事者', 'Facility_P'], ascending=False)
    .assign(Facility_P=lambda d: d['Facility_P'].round(3))
)
# g_hotspot = g_hotspot[g_hotspot['total_n']>100]

# Translate category values, then column names, using the project's mapping
# tables from `config`. Values missing from a mapping become NaN (`Series.map`).
for col in ['號誌-號誌種類名稱', '道路型態子類別名稱']:
    g_hotspot[col] = g_hotspot[col].map(category_value_map[col])
g_hotspot = g_hotspot.rename(columns=feature_name_map)

# g_hotspot.to_excel(f'../{computeddata}/Behaviour/Behaviour_hotspot.xlsx', index=False)

In [23]:
# Roundabout combinations backed by more than 10 accidents, ordered by the
# share of accidents that have a facility nearby (highest first).
(
    g_hotspot
    .loc[lambda d: (d['Road type (subcategory)'] == 'Roundabout') & (d['total_n'] > 10)]
    .sort_values(by='Facility_P', ascending=False)
)

facility,Traffic signal type,Speed limit (Party 1),Road type (subcategory),Facility_P,total_n
757,Flashing signal,40-49,Roundabout,1.0,20
531,No signal,50-59,Roundabout,0.462,532
706,Traffic control signal (with pedestrian-only s...,50-59,Roundabout,0.4,50
618,Traffic control signal,40-49,Roundabout,0.333,12
501,No signal,30-39,Roundabout,0.322,59
630,Traffic control signal,50-59,Roundabout,0.266,94
516,No signal,40-49,Roundabout,0.135,192


### 去掉肇因重新分析
他的效果和BehaviourV2_hotspot相似，都是找出有設施的機率

In [None]:
# Second model: 'facility' is now the node being explained.
# NOTE(review): `data` still contains the 'cause_group' column although this
# section is titled as an analysis without the cause; none of the white-listed
# edges involve it — confirm whether it should be dropped before fitting.
parent = [
    '號誌-號誌種類名稱', '號誌-號誌動作名稱','車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱', '車道劃分設施-分向設施子類別名稱', '道路型態子類別名稱',
    '速限-第1當事者', '道路類別-第1當事者-名稱', 'facility', 'COUNTYNAME'
    ]
cause = ['facility']
result = ['事故類型及型態子類別名稱']

# Edges that are allowed: selected environment variables -> facility.
white_list = [
    ('速限-第1當事者', 'facility'),
    ('道路類別-第1當事者-名稱', 'facility'),
    ('道路型態子類別名稱', 'facility'),
    ('號誌-號誌種類名稱', 'facility'),
]

# Only columns that exist in `data` can be network nodes. 'COUNTYNAME' is not
# among the columns selected into `data` in this notebook, so pairs mentioning
# it are left out. Self-pairs and duplicates are skipped too, because
# 'facility' appears in both `parent` and `cause`.
known_nodes = set(data.columns)

black_list = []
# cause -> parent (facility must not point back at the environment)
black_list += [
    (c, p) for c in cause for p in parent
    if p != c and p in known_nodes
]
# result -> every other node (the accident type is a pure sink)
black_list += [
    (r, x)
    for r in result
    for x in dict.fromkeys(parent + cause + result)
    if x != r and x in known_nodes
]

# NOTE: this rebinds model_all / model_param_all / model_independence_all, so
# the cause_group model fitted earlier is no longer available afterwards.
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)

In [None]:
# Forward CPD of the facility model: P(facility | road environment), with the
# observed count of every combination attached.
parent = ['速限-第1當事者', '道路類別-第1當事者-名稱', '道路型態子類別名稱', '號誌-號誌種類名稱']
child = 'facility'
# verbose=0 matches the other print_CPD calls in this notebook and keeps the
# full set of CPD tables from being printed into the cell output.
CPDs = bn.print_CPD(model_param_all, verbose=0)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

# Project-specific hotspot filter, then keep rows with probability above 0.1
# that are backed by more than 100 observations.
hotspot = filter_cpd_for_hotspot(filtered)
hotspot = hotspot[(hotspot['p'] > 0.1) & (hotspot['n'] > 100)]

In [None]:
# Split the hotspot rows by facility flag, each ordered by probability (highest
# first), then stack the two halves and order the combined table for display.
by_p_desc = dict(by=['p'], ascending=False)
filtered_0 = hotspot.loc[hotspot['facility'] == '0'].sort_values(**by_p_desc)
filtered_1 = hotspot.loc[hotspot['facility'] == '1'].sort_values(**by_p_desc)

full_filter = pd.concat([filtered_0, filtered_1], axis=0)
full_filter = full_filter.sort_values(
    by=['facility', '號誌-號誌種類名稱', '速限-第1當事者', 'p'],
    ascending=False,
)
full_filter
# for col in ['道路類別-第1當事者-名稱', '道路型態子類別名稱', '號誌-號誌種類名稱']:
#     full_filter[col] = full_filter[col].map(category_value_map[col])
# full_filter.rename(columns=feature_name_map, inplace=True)
# full_filter.to_excel(f'../{computeddata}/Behaviour/Origin_outlierV2.xlsx', index=False)